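"""Scrape 5e creature statblocks from the 5ecompendium.github.io bestiary.

Pages are fetched asynchronously with aiohttp and parsed with BeautifulSoup;
open_statblock can also display a statblock in a Firefox tab via Selenium.
"""
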
import re

import aiohttp
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


async def get_statblock_html(creature_name):
    """Fetch the creature's bestiary page and return parsed HTML, or None."""
    # Slugify the name for the URL
    creature_name = creature_name.replace(" ", "-").lower().strip("#").strip("+")
    # URL to scrape
    url = f"https://5ecompendium.github.io/bestiary/creature/{creature_name}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # Check if the request was successful
            if response.status == 200:
                # Parse the HTML content
                return BeautifulSoup(await response.text(), "html.parser")
            print(f"Failed to retrieve page. Status code: {response.status}")
            return None


async def get_statblock(creature_name):
    """Scrape a creature's page and return its statblock as a dict, or None."""
    # Get the HTML content
    soup = await get_statblock_html(creature_name)
    if soup is None:
        return None
    try:
        # Extract creature name and type
        creature_name = (
            soup.find("h1").text.strip() if soup.find("h1") else "Unknown Creature"
        )
        creature_type = (
            soup.find("h2").text.strip() if soup.find("h2") else "Unknown Type"
        )

        # Extract abilities
        abilities = {}
        for ability in soup.find_all("div", class_="abilities"):
            for stat in ability.find_all("div"):
                ability_name = (
                    stat.find("h4").text.strip() if stat.find("h4") else "Unknown"
                )
                ability_score = stat.find("p").text.strip() if stat.find("p") else "N/A"
                abilities[ability_name] = ability_score

        # Extract special traits
        important_info = {}
        for trait in soup.find_all("div", class_="property-line"):
            trait_name = (
                trait.find("h4").text.strip() if trait.find("h4") else "Unknown"
            )
            trait_value = trait.find("p").text.strip() if trait.find("p") else "N/A"
            important_info[trait_name] = trait_value

        # Safely extract the paragraph following a given h3 header,
        # or return a default message
        def safe_extract(header_id):
            header = soup.find("h3", id=header_id)
            if header and header.find_next("p"):
                return header.find_next("p").text.strip()
            return "Not available"

        # Extract actions, reactions, and legendary actions
        actions = safe_extract("actions")
        reactions = safe_extract("reactions")
        legendary_actions = safe_extract("legendary-actions")

        # Compile the statblock
        monster_statblock = {
            "creature_name": creature_name,
            "creature_type": creature_type,
            "ability_scores": abilities,
            "important_info": important_info,
            "actions": actions,
            "reactions": reactions,
            "legendary_actions": legendary_actions,
        }
        return monster_statblock
    except Exception as e:
        print(f"Error parsing statblock: {e}")
        return None
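
# Note: get_statblock is a coroutine, so callers must await it from a running
# event loop or drive it with asyncio.run() (see the demo at the bottom).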


def open_statblock(creature_type, driver):
    """Open (or focus) the creature's bestiary page in a Firefox session."""
    creature_type = strip_numbers(creature_type)
    url = f"https://5ecompendium.github.io/bestiary/creature/{creature_type}"
    if driver is None:
        # Specify the path to the GeckoDriver executable
        gecko_driver_path = "./utils/gecko_driver/geckodriver.exe"  # Change this to your geckodriver path
        # Start a Selenium WebDriver session with Firefox
        service = Service(executable_path=gecko_driver_path)
        driver = webdriver.Firefox(service=service)

    # Check if the URL is already open in any tab
    found = False
    for handle in driver.window_handles:
        driver.switch_to.window(handle)
        if driver.current_url == url:
            found = True
            break

    # If not found, load the URL in the current tab
    if not found:
        driver.get(url)
        # Optionally, wait for the body element to ensure the page has loaded
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except TimeoutException:
            print("Page load timed out!")

    # Focus the tab where the URL was found (the loop leaves the driver on it)
    if found:
        driver.switch_to.window(handle)
    return driver
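
# Usage sketch (an assumption, not from the original file): reuse the returned
# driver so repeated lookups share one Firefox session, e.g.
#
#     driver = open_statblock("Goblin 3", None)  # starts Firefox; digits are stripped
#     driver = open_statblock("Orc", driver)     # reuses the same session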


def strip_numbers(creature_type):
    """Drop digits from a creature label and slugify it for the bestiary URL."""
    # Remove digits (e.g. an initiative-tracker suffix like "Goblin 3"), then
    # trim the leftover whitespace so the slug has no trailing hyphen
    creature_type_clean = re.sub(r"\d+", "", creature_type).strip()
    creature_type_clean = creature_type_clean.replace(" ", "-").lower().strip("#").strip("+")
    return creature_type_clean
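

if __name__ == "__main__":
    # Minimal demo (a sketch, not part of the original file). "goblin" is an
    # arbitrary example slug and assumes the bestiary serves a page at
    # /bestiary/creature/goblin.
    import asyncio

    statblock = asyncio.run(get_statblock("goblin"))
    if statblock is not None:
        print(statblock["creature_name"], "-", statblock["creature_type"])
        for name, score in statblock["ability_scores"].items():
            print(f"  {name}: {score}")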