diff --git a/timetable/extractor.py b/timetable/extractor.py index 0aaa79f..46fd8ff 100644 --- a/timetable/extractor.py +++ b/timetable/extractor.py @@ -7,6 +7,7 @@ with open("full_location.json") as data_file: full_locations = json.load(data_file) +DAYS_MAP = {'Mon': "Monday", 'Tue': "Tuesday", 'Wed': "Wednesday", 'Thur': "Thursday", 'Fri': "Friday", 'Sat': "Saturday"} @dataclass class Course: @@ -36,6 +37,25 @@ def title(self) -> str: def end_time(self) -> int: return self.duration + self.start_time +def create_timings(_table: Tag | NavigableString) -> list[int]: + r""" Creates a list of timings in 24 hours format - [8, ..., 12, 14, ..., 17]""" + headings = _table.find_all( + 'td', + { + 'class': 'tableheader', + 'style': 'padding-top:5px;padding-bottom:5px;padding-left:7px;padding-right:7px', + 'nowrap': '' + } + ) + + get_hour = lambda t: int(t.get_text().split(':')[0]) + + return [ + get_hour(time) + 12 if + get_hour(time) < 6 else + get_hour(time) for + time in headings if time.get_text() != 'Day Name' + ] def build_courses(html: str, course_names: dict) -> list[Course]: r""" @@ -48,71 +68,65 @@ def build_courses(html: str, course_names: dict) -> list[Course]: list of Course objects """ courses = [] - days = {'Mon': "Monday", 'Tue': "Tuesday", 'Wed': "Wednesday", 'Thur': "Thursday", 'Fri': "Friday", - 'Sat': "Saturday"} soup = BeautifulSoup(html, 'html.parser') table = soup.find('table', {'border': '1', 'cellpadding': '0', 'cellspacing': '0'}) - def create_timings(_table: Tag | NavigableString) -> list[int]: - r""" Creates a list of timings in 24 hours format - [8, ..., 12, 14, ..., 17]""" - headings = _table.find_all('td', {'class': 'tableheader', - 'style': 'padding-top:5px;padding-bottom:5px;padding-left:7px;padding-right' - ':7px', 'nowrap': ''}) - return [int(time.get_text().split(':')[0]) + 12 if int(time.get_text().split(':')[0]) < 6 else int( - time.get_text().split(':')[0]) for time in headings if time.get_text() != 'Day Name'] - timings = create_timings(table) rows = table.find_all('tr') for row in rows: day_cell: PageElement = row.find('td', {'valign': 'top'}) if not day_cell: continue - day = days[day_cell.get_text()] if day_cell.get_text() in days.keys() else day_cell.get_text() + day = DAYS_MAP.get(day_cell.get_text(), day_cell.get_text()) - # Merge timeslots occurring adjacent to each other and initialize course objects + # This is the previously parsed course/time slot. Used to merge timeslots occurring adjacent to each other and initialize course objects prev: Course | None = None - course_duration: int = 0 cells = [cell for cell in row.find_all('td') if cell.attrs.get('valign') != 'top'] - for index, cell in enumerate(cells): - code = cell.get_text()[:7].strip() if cell.get_text()[:7] != "CS10001" else "CS10003" - if not code: continue # continue if cell has no course in it - # CS10003 is the actual code, but it is written as CS10001 in the timetable - location = cell.get_text()[7:] - cell_duration = int(cell.attrs.get('colspan')) - # To reuse code, uses outer scope variables - def append_prev(): - prev.duration = course_duration - courses.append(prev) + for index, cell in enumerate(cells): # Empty cell means either previous class just ended or no class is scheduled - if cell.get_text() == ' ' or cell.get_text() == ' ': + if cell.get_text().strip() == "": if prev: - append_prev() - course_duration = 0 - prev = None + # Previous slot ended + courses.append(prev) + prev = None continue + code = cell.get_text()[:7].strip() + if not code: continue # continue if cell has no course in it + + # Special exception: CS10003 is the actual code, but it is written as CS10001 in the timetable + if code == "CS10001": + code = "CS10003" + + location = cell.get_text()[7:] + cell_duration = int(cell.attrs.get('colspan')) + # Either new class started just after previous one or previous class is continued - if prev: - # If previous class is same as current class, add the duration to the previous class + if prev is not None: if code == prev.code: - course_duration += cell_duration - # The previous class is different from the current one, meaning previous class ended. Clean up, add to - # list and initialize a new course object for the current class + # Previous class is same as current class, add the duration to the previous class + prev.duration += cell_duration else: - append_prev() - prev = Course(code=code, name=course_names[code], day=day, - start_time=timings[index], - location=location) - course_duration = cell_duration - # New class started after break - else: - prev = Course(code=code, name=course_names[code], day=day, - start_time=timings[index], - location=location) - course_duration = cell_duration - - # Day ended, add the last class - if index == len(cells) - 1: - append_prev() + # The previous class is different from the current one, meaning previous class ended. Clean up, add to + # list and initialize a new course object for the current class + courses.append(prev) + prev = None + + # Either new class started after break or after a different previous course + if prev is None: + prev = Course( + code=code, + name=course_names[code], + day=day, + start_time=timings[index], + location=location, + duration=cell_duration + ) + + # Day ended, add the last class + if prev is not None: + courses.append(prev) + prev = None + return courses