Commit 52af1a4 (1 parent: 3791f27)
Showing 3 changed files with 106 additions and 228 deletions.
@@ -1,214 +1,104 @@
 import json
 import re
 from datetime import datetime
-from io import BytesIO, StringIO
 
 import scrapy
 from city_scrapers_core.constants import BOARD, NOT_CLASSIFIED
 from city_scrapers_core.items import Meeting
 from city_scrapers_core.spiders import CityScrapersSpider
-from pdfminer.high_level import extract_text_to_fp
-from pdfminer.layout import LAParams
 
 
 class IlPollutionControlSpider(CityScrapersSpider):
     name = "il_pollution_control"
     agency = "Illinois Pollution Control Board"
     timezone = "America/Chicago"
+    domain = "https://pcb.illinois.gov"
     start_urls = [
-        "https://pcb.illinois.gov/ClerksOffice/MeetingMinutes",
-        "https://pcb.illinois.gov/CurrentAgendas",
+        domain + "/ClerksOffice/GetCalendarEvents",
     ]
+    calendar_page = "https://pcb.illinois.gov/ClerksOffice/Calendar"
+    default_links = [
+        {
+            "title": "Agendas",
+            "href": "https://pcb.illinois.gov/CurrentAgendas",
+        },
+        {
+            "title": "Meeting minutes",
+            "href": "https://pcb.illinois.gov/ClerksOffice/MeetingMinutes",
+        },
+    ]
-    json_url = "https://pcb.illinois.gov/ClerksOffice/GetCalendarEvents"
-    calendar_url = "https://pcb.illinois.gov/ClerksOffice/Calendar"
 
-    def __init__(self, *args, **kwargs):
-        self.minutes_map = dict()  # Populated by self._parse_minutes()
-        self.agenda_map = dict()  # Populated by self._parse_agenda()
-        self.relevant_years = [
-            str(y) for y in range(datetime.now().year - 1, datetime.now().year + 1)
-        ]
-        super().__init__(*args, **kwargs)
-
-    @classmethod
-    def from_crawler(cls, crawler, *args, **kwargs):
-        """Overridden `from_crawler` to connect `spider_idle` signal."""
-        spider = super().from_crawler(crawler, *args, **kwargs)
-        crawler.signals.connect(spider.spider_idle, signal=scrapy.signals.spider_idle)
-        return spider
-
-    def spider_idle(self):
-        """
-        React to `spider_idle` signal by starting JSON parsing after _parse_minutes.
-        """
-        self.crawler.signals.disconnect(
-            self.spider_idle, signal=scrapy.signals.spider_idle
-        )
-        self.crawler.engine.crawl(
-            scrapy.Request(self.json_url, callback=self._parse_json), self
-        )
-        raise scrapy.exceptions.DontCloseSpider
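The deleted signal wiring is a standard Scrapy pattern for deferring work until all other requests have finished: connect a handler to spider_idle, schedule the follow-up request there, and raise DontCloseSpider so the crawl stays open. Compressed to its skeleton (a hypothetical spider, mirroring the removed code's engine.crawl call):

import scrapy
from scrapy import signals
from scrapy.exceptions import DontCloseSpider

class DeferredRequestSpider(scrapy.Spider):
    name = "deferred_example"  # hypothetical spider, not part of this repo

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Fire spider_idle once all scheduled requests are done.
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        # Disconnect so the deferred request is only scheduled once.
        self.crawler.signals.disconnect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.engine.crawl(
            scrapy.Request("https://example.com", callback=self.parse), self
        )
        raise DontCloseSpider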
-
-    def parse(self, response):
-        """
-        `parse` should always `yield` Meeting items.
-        """
-        # Gather and store links to meeting minutes:
-        for item in response.xpath("//iframe/@src"):
-            yield scrapy.Request(item.get(), callback=self._parse_minutes)
-
-        # Gather and store link to agenda:
-        for agenda_url in self._parse_agenda_page(response):
-            yield scrapy.Request(agenda_url, callback=self._parse_agenda)
-
-    def _parse_minutes(self, response):
-        """Traverse tree of URLs and populate self.minutes_map"""
-        for item in response.xpath("//td[@class='name']/a"):
-            try:
-                href = item.xpath("@href")[0].get()
-                text = item.xpath("b/text()")[0].get().strip()
-                if not any([(year in text) for year in self.relevant_years]):
-                    continue  # Link does not contain documents from recent years
-                if text[-4:] == ".pdf":
-                    text = text[:-4]
-            except IndexError:
-                continue
-
-            url = response.urljoin(href)
-            if ".pdf" not in url:
-                # Not a link to meeting minutes file - go a level deeper
-                yield scrapy.Request(url, callback=self._parse_minutes)
-            else:
-                # Dates are given in several formats:
-                format_strs = ["%m-%d-%Y", "%m-%d-%y", "%m/%d/%Y", "%m/%d/%y"]
-                dt = None
-                for format_str in format_strs:
-                    try:
-                        dt = datetime.strptime(text, format_str).date()
-                    except ValueError:
-                        continue
-                    else:
-                        break  # Found a format_str that matches - stop looking
-                if dt is None:
-                    continue  # Could not find matching format_str - can't process link.
-
-                self.minutes_map[dt] = url
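The removed minutes parser leaned on Python's try/except/else inside the format loop: the else clause runs only when strptime raised nothing, so the first matching format breaks out. The idiom in isolation, with a made-up filename stem:

from datetime import datetime

text = "01/05/2023"  # hypothetical stem of a minutes PDF link
dt = None
for fmt in ("%m-%d-%Y", "%m-%d-%y", "%m/%d/%Y", "%m/%d/%y"):
    try:
        dt = datetime.strptime(text, fmt).date()
    except ValueError:
        continue  # this format does not match - try the next one
    else:
        break  # parsed successfully - stop looking
print(dt)  # 2023-01-05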
-
-    def _parse_agenda_page(self, response):
-        """Scrape link to agenda PDF"""
-        for item in response.xpath("//div/div/a"):
-            for _ in item.xpath(".//div/h5[text()='Board Meeting']"):
-                for href in item.xpath("./@href"):
-                    yield href.get()
-
-    def _parse_agenda(self, response):
-        """Parse PDF with agenda for date and store link + date"""
-        # pdf_obj = PdfFileReader(BytesIO(response.body))
-        # pdf_text = pdf_obj.getPage(0).extractText().replace("\n", "")
-        lp = LAParams(line_margin=0.1)
-        out_str = StringIO()
-        extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
-        pdf_text = out_str.getvalue().replace("\n", "")
-
-        # Find and extract strings for month/day/year:
-        regex = re.compile(r"(?P<month>[a-zA-Z]+) (?P<day>[0-9]+), (?P<year>[0-9]{4})")
-        m = regex.search(pdf_text)
-
-        try:
-            month = datetime.strptime(m.group("month"), "%B").month
-            day = int(m.group("day"))
-            year = int(m.group("year"))
-            self.agenda_map[datetime(year, month, day).date()] = response.url
-        except AttributeError:  # Regex failed to match.
-            return None
-
-        return None
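The removed agenda parser extracted PDF text with pdfminer.six. As a standalone sketch (pdf_to_text is a hypothetical helper; pdf_bytes stands in for response.body):

from io import BytesIO, StringIO

from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

def pdf_to_text(pdf_bytes):
    # A small line_margin makes pdfminer merge tightly-spaced lines,
    # keeping the "Month DD, YYYY" date string in one run of text.
    out = StringIO()
    extract_text_to_fp(BytesIO(pdf_bytes), out, laparams=LAParams(line_margin=0.1))
    return out.getvalue().replace("\n", "")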
 
     def _parse_json(self, response):
         """
         Parse JSON from /ClerksOffice/GetCalendarEvents -> Meetings
         """
         data = json.loads(response.text)
 
         for item in data:
-            if any(
-                s in item["CalendarTypeDesc"].lower()
-                for s in ("holiday", "seminar", "hearing")
-            ):
-                continue  # Not interested in this event type
-
-            title = item["CalendarTypeDesc"].replace("CANCELLED", "").strip()
+            title = item.get("CalendarTypeDesc")
+            if not title or "holiday" in title.lower():
+                continue
             meeting = Meeting(
                 title=title,
-                description="",  # Too inconsistent to parse accurately
-                classification=self._parse_classification(title),
-                start=self._parse_start(item),
-                end=None,
-                all_day=item["IsFullDay"],
+                description=self._parse_description(item),
+                classification=self._parse_classification(item),
+                start=self._parse_datetime(item.get("StartDateTime")),
+                end=self._parse_datetime(item.get("EndDateTime")),
+                all_day=item.get("IsFullDay"),
                 time_notes="",
                 location=self._parse_location(item),
-                links=list(),
-                source=self._parse_source(item, response),
+                links=self._parse_links(item),
+                source=self.calendar_page,
             )
-
-            meeting["links"] = self._parse_links(meeting)
-            meeting["status"] = self._get_status(
-                meeting,
-                text=" ".join([item["CalendarTypeDesc"], item["Description"]]).lower(),
-            )
+            meeting["status"] = self._get_status(meeting, text=item.get("Cancelled"))
             meeting["id"] = self._get_id(meeting)
 
             yield meeting
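The feed's start/end timestamps follow the "%m/%d/%Y %I:%M:%S %p" pattern handled by the _parse_datetime helper added below; a quick check with a hypothetical value, not taken from the live feed:

from datetime import datetime

sample = "06/15/2023 11:00:00 AM"  # hypothetical StartDateTime value
print(datetime.strptime(sample, "%m/%d/%Y %I:%M:%S %p"))  # 2023-06-15 11:00:00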
-    def _parse_classification(self, title):
-        """Parse or generate classification from allowed options."""
-        if "Board" in title:
-            return BOARD
-        else:
-            return NOT_CLASSIFIED
+    def _parse_datetime(self, date_str):
+        """Parse the datetime from the string format in the JSON"""
+        if date_str:
+            return datetime.strptime(date_str, "%m/%d/%Y %I:%M:%S %p")
+        return None
 
-    def _parse_start(self, item):
-        return datetime.strptime(item["StartDateTime"], "%m/%d/%Y %I:%M:%S %p")
+    def _parse_description(self, item):
+        """
+        Extract and clean text from HTML description using Scrapy selectors,
+        removing hidden characters and non-standard whitespace.
+        """
+        description_html = item.get("Description", "")
+        selector = scrapy.Selector(text=description_html)
+        text_lines = selector.xpath("//text()").extract()
+        clean_text = " ".join(line.strip() for line in text_lines if line.strip())
+        # Using regex to remove non-printable characters and other unwanted symbols
+        clean_description = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", clean_text)
+        return clean_description
+
+    def _parse_classification(self, item):
+        if "Board" in item.get("CalendarTypeDesc", ""):
+            return BOARD
+        return NOT_CLASSIFIED
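The new _parse_description logic can be exercised in isolation; the HTML snippet below is invented for illustration, not real feed content:

import re

import scrapy

html = "<p>Board\x0bMeeting<br>11:00 AM, <a href='/doc'>notice</a></p>"
lines = scrapy.Selector(text=html).xpath("//text()").extract()
text = " ".join(line.strip() for line in lines if line.strip())
# The \x0b control character falls in the stripped range:
print(re.sub(r"[\x00-\x1f\x7f-\x9f]", "", text))  # BoardMeeting 11:00 AM, notice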
 
     def _parse_location(self, item):
         """Parse or generate location."""
-        text = " ".join([item["Description"], item["Location"]]).lower()
-        if "thompson" in text:
-            return {
-                "address": "James R. Thompson Center - 100 W. Randolph St. Suite 11-500, Chicago, IL 60601",  # noqa
-                "name": "Chicago IPCB Office",
-            }
-        elif "springfield" in text or "llinois pollution control board" in text:
-            return {
-                "address": "1021 N. Grand Ave. E. - Room 1244 N, Springfield, IL 62702",
-                "name": "Springfield IPCB Office",
-            }
-        elif "sangamo room" in text:
-            return {
-                "address": "1021 N. Grand Ave. E. - Sangamo Room, Springfield, IL 62702",  # noqa
-                "name": "Illinois EPA",
-            }
-        else:
-            return {
-                "address": "",
-                "name": "",
-            }
+        if item.get("Location"):
+            return {
+                "name": "",
+                "address": item["Location"].strip(),
+            }
+        return {"name": "No location provided", "address": ""}
 
-    def _parse_links(self, meeting):
-        """Associate Meeting objects with previously-scraped links"""
-        links = list()
-        key = meeting["start"].date()
-        if key in self.minutes_map:
-            links.append({"href": self.minutes_map[key], "title": "Minutes"})
-        if key in self.agenda_map:
-            links.append({"href": self.agenda_map[key], "title": "Agenda"})
-
-        return links
-
-    def _parse_source(self, item, response):
-        """Parse or generate source."""
-        rel_url = scrapy.Selector(text=item["Description"]).xpath(".//a/@href").get()
-        if rel_url:
-            return response.urljoin(rel_url)
-        else:
-            return self.calendar_url
 
+    def _parse_links(self, item):
+        """Parse links from description."""
+        description_html = item.get("Description")
+        selector = scrapy.Selector(text=description_html)
+        a_tags = selector.css("a")
+        links = []
+        for a_tag in a_tags:
+            # check if href is relative or absolute and prefix domain if needed
+            href = a_tag.attrib.get("href")
+            href_clean = href if href.startswith("http") else self.domain + href
+            title = a_tag.attrib.get("title")
+            clean_title = title if title else "Related document"
+            link = {
+                "href": href_clean,
+                "title": clean_title,
+            }
+            links.append(link)
+        final_links = self.default_links + links
+        return final_links
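Run standalone, the same href normalization behaves like this (the document path is hypothetical):

import scrapy

domain = "https://pcb.illinois.gov"
html = '<a href="/documents/agenda.pdf" title="Agenda">Agenda</a>'
for a_tag in scrapy.Selector(text=html).css("a"):
    href = a_tag.attrib.get("href")
    clean = href if href.startswith("http") else domain + href
    print({"href": clean, "title": a_tag.attrib.get("title") or "Related document"})
# {'href': 'https://pcb.illinois.gov/documents/agenda.pdf', 'title': 'Agenda'}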
Large diffs are not rendered by default.