From e65684611f51c6e80407f233f274a4da37369017 Mon Sep 17 00:00:00 2001
From: Lucas Thurston
Date: Sun, 17 Dec 2023 11:00:13 -0800
Subject: [PATCH 01/19] Implement naive MARC fetcher

---
 metadata_fetcher/fetchers/marc_fetcher.py | 23 +++++++++++++++++++++++
 metadata_fetcher/requirements.txt         |  1 +
 2 files changed, 24 insertions(+)
 create mode 100644 metadata_fetcher/fetchers/marc_fetcher.py

diff --git a/metadata_fetcher/fetchers/marc_fetcher.py b/metadata_fetcher/fetchers/marc_fetcher.py
new file mode 100644
index 000000000..7bb41daf2
--- /dev/null
+++ b/metadata_fetcher/fetchers/marc_fetcher.py
@@ -0,0 +1,23 @@
+import requests
+
+from .Fetcher import Fetcher
+import json
+import pymarc
+
+
+class MarcFetcher(Fetcher):
+    def __init__(self, params: dict[str]):
+        super(MarcFetcher, self).__init__(params)
+        self.url = params.get("harvest_data").get("url")
+
+    def build_fetch_request(self) -> dict[str]:
+        return {"url": self.url}
+
+    def check_page(self, http_resp: requests.Response) -> int:
+        return sum(1 for _ in pymarc.MARCReader(http_resp.content,
+                                                to_unicode=True,
+                                                utf8_handling="replace"))
+
+    def json(self) -> str:
+        return json.dumps({"finished": True})
+
diff --git a/metadata_fetcher/requirements.txt b/metadata_fetcher/requirements.txt
index c3639dff7..2e8ea8bb8 100644
--- a/metadata_fetcher/requirements.txt
+++ b/metadata_fetcher/requirements.txt
@@ -3,3 +3,4 @@ requests
 sickle
 python-dotenv
 beautifulsoup4
+pymarc

From b8463b99b9ea6c47edf23c531ba90fde55411b8b Mon Sep 17 00:00:00 2001
From: Lucas Thurston
Date: Sun, 31 Dec 2023 15:28:28 -0800
Subject: [PATCH 02/19] [WIP] Naive implementation of MARC parsing

---
 metadata_mapper/lambda_function.py              |  3 +-
 metadata_mapper/mappers/marc/marc_mapper.py     | 18 ++++
 .../mappers/marc/ucb_tind_mapper.py             | 94 +++++++++++++++++++
 metadata_mapper/mappers/oai/oai_mapper.py       | 55 ++++++-----
 4 files changed, 146 insertions(+), 24 deletions(-)
 create mode 100644 metadata_mapper/mappers/marc/marc_mapper.py
 create mode 100644 metadata_mapper/mappers/marc/ucb_tind_mapper.py

diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py
index ac558d80e..0b90c2705 100644
--- a/metadata_mapper/lambda_function.py
+++ b/metadata_mapper/lambda_function.py
@@ -29,15 +29,16 @@ def import_vernacular_reader(mapper_type):
         f".mappers.{'.'.join(mapper_parent_modules)}.{snake_cased_mapper_name}_mapper",
         package=__package__
     )
-
     mapper_type_words = snake_cased_mapper_name.split('_')
     class_type = ''.join([word.capitalize() for word in mapper_type_words])
+
     vernacular_class = getattr(
         mapper_module, f"{class_type}Vernacular")
 
     if not issubclass(vernacular_class, Vernacular):
         print(f"{mapper_type} not a subclass of Vernacular", file=sys.stderr)
         exit()
+
     return vernacular_class
 
 
diff --git a/metadata_mapper/mappers/marc/marc_mapper.py b/metadata_mapper/mappers/marc/marc_mapper.py
new file mode 100644
index 000000000..e4cbe4500
--- /dev/null
+++ b/metadata_mapper/mappers/marc/marc_mapper.py
@@ -0,0 +1,18 @@
+from ..oai.oai_mapper import OaiVernacular
+from ..mapper import Record
+
+from lxml import etree
+from sickle import models
+
+from pprint import pprint
+
+class MarcRecord(Record):
+    def UCLDC_map(self):
+        # pprint(self.source_metadata)
+        return {
+
+        }
+
+
+class MarcVernacular(OaiVernacular):
+    pass
diff --git a/metadata_mapper/mappers/marc/ucb_tind_mapper.py b/metadata_mapper/mappers/marc/ucb_tind_mapper.py
new file mode 100644
index 000000000..7b4b957e1
--- /dev/null
+++ b/metadata_mapper/mappers/marc/ucb_tind_mapper.py
@@ -0,0 +1,94 @@
+from .marc_mapper import MarcRecord, MarcVernacular
+
+from sickle import models
+from pymarc import parse_xml_to_array
+from lxml import etree
+from io import StringIO
+
+
+class UcbTindRecord(MarcRecord):
+    def UCLDC_map(self):
+        print({
+            "calisphere-id": self.legacy_couch_db_id.split("--")[1],
+            "isShownAt": self.map_is_shown_at,
+            "isShownBy": self.map_is_shown_by,
+            "title": self.get_marc_field("245", subfield_key="a"),
+
+            # lambda_shepherd misreports that `marc.ucb_tind not yet implemented` if
+            # this isn't here
+            "collection": ["collections1", "collections2"],
+            "identifier": ["identifier1", "identifier2"]
+        })
+
+    def map_is_shown_at(self):
+        """
+        Can we identify is_shown_at by something about the URL format?
+        :return:
+        """
+        return self.get_marc_field("856", subfield_key="u")
+
+    def map_is_shown_by(self):
+        """
+        Can we identify is_shown_by by something about the URL format?
+        :return:
+        """
+        return self.get_marc_field("856", subfield_key="u")
+
+    def get_marc_field(self, field_key: str, subfield_key: str = None):
+        fields = self.source_metadata.get("fields")
+        if not fields:
+            return
+        matching_fields = []
+        for field in fields:
+            fk, fv = list(field.items())[0]
+            if field_key == fk:
+                matching_fields.append(fv)
+
+        if not matching_fields:
+            return []
+
+        subfield_values = []
+        for field in matching_fields:
+            subfields = field.get("subfields")
+            for subfield in subfields:
+                sk, sv = list(subfield.items())[0]
+                if subfield_key == sk or subfield_key is None:
+                    subfield_values.append(sv)
+        return subfield_values
+
+
+class UcbTindVernacular(MarcVernacular):
+    record_cls = UcbTindRecord
+
+    def _process_record(self, record_element: list, request_url: str) -> UcbTindRecord:
+        """
+        Process a record element and extract relevant information.
+
+        :param record_element: Element representing a single record.
+        :param request_url: The URL of the request.
+        :return: A dictionary containing the extracted information from the record.
+        """
+        marc_record_element = record_element.find(".//marc:record", namespaces={
+            "marc": "http://www.loc.gov/MARC21/slim"})
+        marc_record_string = etree.tostring(marc_record_element,
+                                            encoding="utf-8").decode("utf-8")
+
+        # Wrap the record in collection so pymarc can read it
+        marc_collection_xml_full = \
+            ('<collection xmlns="http://www.loc.gov/MARC21/slim">'
+             f'{marc_record_string}'
+             '</collection>')
+
+        record = parse_xml_to_array(StringIO(marc_collection_xml_full))[0].as_dict()
+
+        sickle_rec = models.Record(record_element)
+        sickle_header = sickle_rec.header
+
+        if sickle_header.deleted:
+            return None
+
+        record["datestamp"] = sickle_header.datestamp
+        record["id"] = sickle_header.identifier
+        record["request_url"] = request_url
+
+        return record
diff --git a/metadata_mapper/mappers/oai/oai_mapper.py b/metadata_mapper/mappers/oai/oai_mapper.py
index a445070a2..2f72986a4 100644
--- a/metadata_mapper/mappers/oai/oai_mapper.py
+++ b/metadata_mapper/mappers/oai/oai_mapper.py
@@ -1,5 +1,6 @@
-from typing import Union
+from typing import Union, Optional
 
+import requests
 from lxml import etree
 from sickle import models
 
@@ -131,38 +132,46 @@ class OaiVernacular(Vernacular):
 
+    namespaces = {"oai2": "http://www.openarchives.org/OAI/2.0/"}
 
-    def parse(self, api_response):
+    def parse(self, api_response: requests.Response) -> list[Record]:
         api_response = bytes(api_response, "utf-8")
-        namespace = {"oai2": "http://www.openarchives.org/OAI/2.0/"}
         page = etree.XML(api_response)
 
-        request_elem = page.find("oai2:request", namespace)
-        if request_elem is not None:
-            request_url = request_elem.text
-        else:
-            request_url = None
+        record_elements = self._get_record_elements(page)
+        request_url = self._get_request_url(page)
+        records = self._create_records(record_elements, request_url)
 
-        record_elements = (
+        return self.get_records(records)
+
+    def _get_record_elements(self, page: etree.ElementBase) -> list[etree.ElementBase]:
+        return (
             page
-            .find("oai2:ListRecords", namespace)
-            .findall("oai2:record", namespace)
+            .find("oai2:ListRecords", namespaces=self.namespaces)
+            .findall("oai2:record", namespaces=self.namespaces)
         )
 
-        records = []
-        for re in record_elements:
-            sickle_rec = models.Record(re)
-            sickle_header = sickle_rec.header
-            if sickle_header.deleted:
-                continue
+    def _get_request_url(self, page: etree.ElementBase) -> Optional[str]:
+        request_elem = page.find("oai2:request", namespaces=self.namespaces)
+        return request_elem.text if request_elem is not None else None
 
-            record = self.strip_metadata(sickle_rec.metadata)
-            record["datestamp"] = sickle_header.datestamp
-            record["id"] = sickle_header.identifier
-            record["request_url"] = request_url
-            records.append(record)
+    def _create_records(self, record_elements: list[etree.ElementBase],
+                        request_url: str) -> list[Record]:
+        return [self._process_record(re, request_url) for re in record_elements]
 
-        return self.get_records(records)
+    def _process_record(self, record_element, request_url):
+        sickle_rec = models.Record(record_element)
+        sickle_header = sickle_rec.header
+
+        if sickle_header.deleted:
+            return None
+
+        record = self.strip_metadata(sickle_rec.metadata)
+        record["datestamp"] = sickle_header.datestamp
+        record["id"] = sickle_header.identifier
+        record["request_url"] = request_url
+
+        return record
 
     def strip_metadata(self, record_metadata):
         stripped = {}

From aac6047ff595c0d0d2620a4544a40a92e928d6b8 Mon Sep 17 00:00:00 2001
From: Lucas Thurston
Date: Wed, 17 Jan 2024 10:50:18 -0800
Subject: [PATCH 03/19] [WIP] Sleep for 1 sec between collections

---
metadata_fetcher/lambda_function.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index 28a66cf45..1eff42c89 100644 --- a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -2,6 +2,7 @@ import json import logging import sys +import time from .fetchers.Fetcher import Fetcher from rikolti.utils.versions import create_vernacular_version @@ -21,7 +22,7 @@ def import_fetcher(harvest_type): # AWS Lambda entry point -def fetch_collection(payload, vernacular_version) -> list[dict]: +def fetch_collection(payload, vernacular_version, context, sleep=False) -> list[dict]: """ returns a list of dicts with the following keys: document_count: int @@ -38,6 +39,23 @@ def fetch_collection(payload, vernacular_version) -> list[dict]: payload.update({'vernacular_version': vernacular_version}) next_page = payload fetch_status = [] + try: + if sleep: + print("Sleeping!") + time.sleep(1) + print("Done Sleeping!") + fetcher = fetcher_class(payload) + fetch_status.append(fetcher.fetch_page()) + except InvalidHarvestEndpoint as e: + logger.error(e) + fetch_status.append({ + 'status': 'error', + 'body': json.dumps({ + 'error': repr(e), + 'payload': payload + }) + }) + return fetch_status while not next_page.get('finished'): fetcher = fetcher_class(next_page) From d21d33b9ccf8870abab71152da7e1724a67e236d Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Wed, 17 Jan 2024 10:51:26 -0800 Subject: [PATCH 04/19] [WIP] Implement approved marc data extraction mechanism --- .../mappers/marc/ucb_tind_mapper.py | 163 +++++++++++++----- 1 file changed, 116 insertions(+), 47 deletions(-) diff --git a/metadata_mapper/mappers/marc/ucb_tind_mapper.py b/metadata_mapper/mappers/marc/ucb_tind_mapper.py index 7b4b957e1..9bde4f982 100644 --- a/metadata_mapper/mappers/marc/ucb_tind_mapper.py +++ b/metadata_mapper/mappers/marc/ucb_tind_mapper.py @@ -1,4 +1,5 @@ from .marc_mapper import MarcRecord, MarcVernacular +from ..oai.oai_mapper import OaiVernacular from sickle import models from pymarc import parse_xml_to_array @@ -8,56 +9,123 @@ class UcbTindRecord(MarcRecord): def UCLDC_map(self): - print({ + return { "calisphere-id": self.legacy_couch_db_id.split("--")[1], - "isShownAt": self.map_is_shown_at, - "isShownBy": self.map_is_shown_by, - "title": self.get_marc_field("245", subfield_key="a"), + "isShownAt": self.get_marc_values(["856"], ["u"]), + "isShownBy": self.get_marc_values(["856"], ["u"]), + "language": self.get_marc_values(["041"], ["a"]), + "date": self.get_marc_values(["260"], ["c"]), + "publisher": self.get_marc_values(["260"], ["a", "b"]), + "format": self.get_marc_values(["337", "338", "340"], ["a"]), + "extent": self.map_extent, + "identifier": self.get_marc_values(["020", "022", "035"], ["a"]), + "creator": self.get_marc_values(["100", "110", "111"]), + "relation": self.map_relation, + "description": self.map_description, + "rights": self.get_marc_values(["506", "540"]), + "temporal": self.get_marc_values(["648"]), + "contributor": self.get_marc_values(["700", "710", "711", "720"]), + "title": self.map_title, + "spatial": self.map_spatial, + } - # lambda_shepherd misreports that `marc.ucb_tind not yet implemented` if - # this isn't here - "collection": ["collections1", "collections2"], - "identifier": ["identifier1", "identifier2"] - }) + def get_metadata_fields(self): + return [str(i) for i in [600, 630, 650, 651] + list(range(610, 620)) + list( + range(653, 659)) + 
list(range(690, 700))] - def map_is_shown_at(self): + def map_spatial(self) -> list: + f651 = self.get_marc_values(["651"], ["a"]) + + return f651 + self.get_marc_values(self.get_metadata_fields(), ["z"]) + + def map_subject(self) -> list: + return self.get_marc_values(self.get_metadata_fields()) + + def map_temporal(self) -> list: + f648 = self.get_marc_values(["648"]) + + return f648 + self.get_marc_values(self.get_metadata_fields(), ["y"]) + + def map_format(self) -> list: + return self.get_marc_values(self.get_metadata_fields(), ["v"]) + + def map_description(self) -> list: + field_range = [str(i) for i in range(500, 600) if i != 538] + + return self.get_marc_values(field_range) + + def map_relation(self) -> list: + field_range = [str(i) for i in range(760, 788)] # Up to 787 + + self.get_marc_values(field_range) + + def map_identifier(self) -> list: + f050 = self.get_marc_values(["050"], ["a", "b"]) + + return f050 + self.get_marc_values(["020", "022", "035"], ["a"]) + + def map_extent(self) -> list: """ - Can we identify is_shown_at by something about the URL format? - :return: + Retrieves the extent values from MARC field 300 and 340. + + :return: A list of extent values. """ - return self.get_marc_field("856", subfield_key="u") + return self.get_marc_values(["300"]) + self.get_marc_values(["340"], ["b"]) + + def map_title(self): + # 245, all subfields except c + f245 = self.get_marc_values(["245"], ["c"], exclude_subfields=True) + + # 242, all subfields + f242 = self.get_marc_values(["242"]) + + # 240, all subfields + f240 = self.get_marc_values(["240"]) - def map_is_shown_by(self): + return f245 + f242 + f240 + + def get_marc_values(self, field_tags: list, subfield_codes=[], **kwargs) -> list: """ - Can we identify is_shown_by by something about the URL format? - :return: + Get the values of specified subfields from given MARC fields. + + :param field_tags: A list of MARC fields. + :param subfield_codes: A list of subfield codes to filter the values. If empty, all subfields will be included. + :return: A list of values of the specified subfields. 
""" - return self.get_marc_field("856", subfield_key="u") - - def get_marc_field(self, field_key: str, subfield_key: str = None): - fields = self.source_metadata.get("fields") - if not fields: - return - matching_fields = [] - for field in fields: - fk, fv = list(field.items())[0] - if field_key == fk: - matching_fields.append(fv) - - if not matching_fields: - return [] - - subfield_values = [] - for field in matching_fields: - subfields = field.get("subfields") - for subfield in subfields: - sk, sv = list(subfield.items())[0] - if subfield_key == sk or subfield_key is None: - subfield_values.append(sv) - return subfield_values - - -class UcbTindVernacular(MarcVernacular): + + def include_subfield(check_code, subfield_codes, exclude_subfields): + if not subfield_codes: + return True + if exclude_subfields: + return check_code not in subfield_codes + else: + return check_code in subfield_codes + + exclude_subfields = "exclude_subfields" in kwargs and kwargs[ + 'exclude_subfields'] + + matching_fields = [field for field in self.get_marc_fields(field_tags)] + + value_list = [value + for matching_field in matching_fields + for subfield in list(matching_field.subfields_as_dict().items()) + for value in subfield[1] + if include_subfield(subfield[0], subfield_codes, exclude_subfields)] + + return value_list if isinstance(value_list, list) else [] + + def get_marc_fields(self, field_tags: list) -> list: + """ + Get the specified MARC fields from the source_metadata. + + :param field_tags: List of MARC fields to retrieve. + :return: List of MARC fields from the source_metadata. + """ + + return [f for f in self.source_metadata.get("marc").get_fields(*field_tags)] + + +class UcbTindVernacular(OaiVernacular): record_cls = UcbTindRecord def _process_record(self, record_element: list, request_url: str) -> UcbTindRecord: @@ -79,16 +147,17 @@ def _process_record(self, record_element: list, request_url: str) -> UcbTindReco f'{marc_record_string}' '') - record = parse_xml_to_array(StringIO(marc_collection_xml_full))[0].as_dict() - sickle_rec = models.Record(record_element) sickle_header = sickle_rec.header if sickle_header.deleted: return None - record["datestamp"] = sickle_header.datestamp - record["id"] = sickle_header.identifier - record["request_url"] = request_url + record = { + "datestamp": sickle_header.datestamp, + "id": sickle_header.identifier, + "request_url": request_url, + "marc": parse_xml_to_array(StringIO(marc_collection_xml_full))[0] + } return record From 55e5f7339f4278aa28bfed647a82ecedd98c88e9 Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Sun, 21 Jan 2024 16:06:51 -0800 Subject: [PATCH 05/19] [S] --- .../mappers/marc/ucb_tind_mapper.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/metadata_mapper/mappers/marc/ucb_tind_mapper.py b/metadata_mapper/mappers/marc/ucb_tind_mapper.py index 9bde4f982..71eb65504 100644 --- a/metadata_mapper/mappers/marc/ucb_tind_mapper.py +++ b/metadata_mapper/mappers/marc/ucb_tind_mapper.py @@ -16,7 +16,7 @@ def UCLDC_map(self): "language": self.get_marc_values(["041"], ["a"]), "date": self.get_marc_values(["260"], ["c"]), "publisher": self.get_marc_values(["260"], ["a", "b"]), - "format": self.get_marc_values(["337", "338", "340"], ["a"]), + "format": self.map_format, "extent": self.map_extent, "identifier": self.get_marc_values(["020", "022", "035"], ["a"]), "creator": self.get_marc_values(["100", "110", "111"]), @@ -27,6 +27,7 @@ def UCLDC_map(self): "contributor": self.get_marc_values(["700", "710", "711", 
"720"]), "title": self.map_title, "spatial": self.map_spatial, + "subject": self.map_subject } def get_metadata_fields(self): @@ -38,6 +39,11 @@ def map_spatial(self) -> list: return f651 + self.get_marc_values(self.get_metadata_fields(), ["z"]) + def map_format(self) -> list: + f3xx = self.get_marc_values(["337", "338", "340"], ["a"]), + + return f3xx + self.get_marc_values(self.get_metadata_fields(), ["v"]) + def map_subject(self) -> list: return self.get_marc_values(self.get_metadata_fields()) @@ -72,7 +78,7 @@ def map_extent(self) -> list: """ return self.get_marc_values(["300"]) + self.get_marc_values(["340"], ["b"]) - def map_title(self): + def map_title(self) -> list: # 245, all subfields except c f245 = self.get_marc_values(["245"], ["c"], exclude_subfields=True) @@ -93,7 +99,13 @@ def get_marc_values(self, field_tags: list, subfield_codes=[], **kwargs) -> list :return: A list of values of the specified subfields. """ - def include_subfield(check_code, subfield_codes, exclude_subfields): + def subfield_matches(check_code: str, subfield_codes: list, exclude_subfields: bool) -> bool: + """ + :param check_code: The code to check against the subfield codes. + :param subfield_codes: A list of subfield codes to include / exclude + :param exclude_subfields: A boolean value indicating whether to exclude the specified subfield codes. + :return: A boolean value indicating whether the check_code is included or excluded based on the subfield_codes and exclude_subfields parameters. + """ if not subfield_codes: return True if exclude_subfields: @@ -110,7 +122,7 @@ def include_subfield(check_code, subfield_codes, exclude_subfields): for matching_field in matching_fields for subfield in list(matching_field.subfields_as_dict().items()) for value in subfield[1] - if include_subfield(subfield[0], subfield_codes, exclude_subfields)] + if subfield_matches(subfield[0], subfield_codes, exclude_subfields)] return value_list if isinstance(value_list, list) else [] From 3cfd6827707b680217d71d3b25a38ab4c7de6609 Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Mon, 22 Jan 2024 15:18:08 -0800 Subject: [PATCH 06/19] [WIP] Fetch control fields; fetch leader values --- .../mappers/marc/ucb_tind_mapper.py | 253 ++++++++++++++++-- 1 file changed, 224 insertions(+), 29 deletions(-) diff --git a/metadata_mapper/mappers/marc/ucb_tind_mapper.py b/metadata_mapper/mappers/marc/ucb_tind_mapper.py index 71eb65504..ac28283a3 100644 --- a/metadata_mapper/mappers/marc/ucb_tind_mapper.py +++ b/metadata_mapper/mappers/marc/ucb_tind_mapper.py @@ -6,6 +6,8 @@ from lxml import etree from io import StringIO +from typing import Callable +from collections import OrderedDict class UcbTindRecord(MarcRecord): def UCLDC_map(self): @@ -27,48 +29,104 @@ def UCLDC_map(self): "contributor": self.get_marc_values(["700", "710", "711", "720"]), "title": self.map_title, "spatial": self.map_spatial, - "subject": self.map_subject + "spec_type": self.map_spec_type, + "subject": self.map_subject, + "type": self.map_type, } + def map_type(self): + self.get_marc_leader("type_of_control") + self.get_marc_leader("bibliographic_level") + + print(self.get_marc_control_field("005", 1)) + print(self.get_marc_control_field("001", 3)) + + def map_spec_type(self): + pass + def get_metadata_fields(self): return [str(i) for i in [600, 630, 650, 651] + list(range(610, 620)) + list( range(653, 659)) + list(range(690, 700))] def map_spatial(self) -> list: - f651 = self.get_marc_values(["651"], ["a"]) + f651 = self.get_marc_data_fields(["651"], ["a"]) - return 
f651 + self.get_marc_values(self.get_metadata_fields(), ["z"]) + return f651 + self.get_marc_data_fields(self.get_metadata_fields(), ["z"]) def map_format(self) -> list: - f3xx = self.get_marc_values(["337", "338", "340"], ["a"]), + f3xx = self.get_marc_data_fields(["337", "338", "340"], ["a"]), - return f3xx + self.get_marc_values(self.get_metadata_fields(), ["v"]) + return f3xx + self.get_marc_data_fields(self.get_metadata_fields(), ["v"]) def map_subject(self) -> list: - return self.get_marc_values(self.get_metadata_fields()) + + def get_delimiters(tag, code): + """ + Returns the appropriate delimiter(s) based on the tag and code + """ + if tag == "658": + if code == "b": + return [":"] + elif code == "c": + return [" [", "]"] + elif code == "d": + return ["--"] + elif ((tag == "653") or (int(tag) in range(690, 700)) or + (code == "b" and + tag in ("654", "655")) or (code in ("v", "x", "y", "z"))): + return ["--"] + elif (tag == "610"): + if code == "b": + return [" "] + else: + return ["--"] + elif code == "d": + return [", "] + + return [". "] + + def split_subject(value, tag, code): + delimiters = get_delimiters(tag, code) + + if not code or code.isdigit(): + # Skip codes that are numeric + return + + value = value.rstrip(", ") + + if value: + delimiters = get_delimiters(tag, code) + for delimiter in delimiters: + values = [delimiter.join(values)] + if delimiter != delimiters[-1]: + # Append an empty value for subsequent joins + values.append("") + + return values + + return self.get_marc_data_fields(self.get_metadata_fields(), process_value=split_subject) def map_temporal(self) -> list: - f648 = self.get_marc_values(["648"]) + f648 = self.get_marc_data_fields(["648"]) - return f648 + self.get_marc_values(self.get_metadata_fields(), ["y"]) + return f648 + self.get_marc_data_fields(self.get_metadata_fields(), ["y"]) def map_format(self) -> list: - return self.get_marc_values(self.get_metadata_fields(), ["v"]) + return self.get_marc_data_fields(self.get_metadata_fields(), ["v"]) def map_description(self) -> list: - field_range = [str(i) for i in range(500, 600) if i != 538] + field_range = [str(i) for i in range(500, 600) if i != 538 and i != 540] - return self.get_marc_values(field_range) + return self.get_marc_data_fields(field_range, ["a"]) def map_relation(self) -> list: field_range = [str(i) for i in range(760, 788)] # Up to 787 - self.get_marc_values(field_range) + self.get_marc_data_fields(field_range) def map_identifier(self) -> list: - f050 = self.get_marc_values(["050"], ["a", "b"]) + f050 = self.get_marc_data_fields(["050"], ["a", "b"]) - return f050 + self.get_marc_values(["020", "022", "035"], ["a"]) + return f050 + self.get_marc_data_fields(["020", "022", "035"], ["a"]) def map_extent(self) -> list: """ @@ -76,35 +134,82 @@ def map_extent(self) -> list: :return: A list of extent values. 
""" - return self.get_marc_values(["300"]) + self.get_marc_values(["340"], ["b"]) + return self.get_marc_data_fields(["300"]) + self.get_marc_data_fields(["340"], ["b"]) def map_title(self) -> list: # 245, all subfields except c - f245 = self.get_marc_values(["245"], ["c"], exclude_subfields=True) + f245 = self.get_marc_data_fields(["245"], ["c"], exclude_subfields=True) # 242, all subfields - f242 = self.get_marc_values(["242"]) + f242 = self.get_marc_data_fields(["242"]) # 240, all subfields - f240 = self.get_marc_values(["240"]) + f240 = self.get_marc_data_fields(["240"]) return f245 + f242 + f240 - def get_marc_values(self, field_tags: list, subfield_codes=[], **kwargs) -> list: + def get_marc_control_field(self, field_tag: str, index: int = None) -> list: + """ + Get MARC control field. Returns an empty string if: + * Control field isn't set + * No value exists at the requested index + Otherwise it returns a value + + TODO: maybe need to accept slices in addition to ints for the index + + :param field_tag: Field tag to retrieve. + :param index: A specific index to fetch + :return: List of values for the control fields. + """ + + # Don't let any data tags sneak in! They have subfields. + data_field_tag = field_tag if field_tag.isnumeric() and int(field_tag) < 100 else "" + + values = [v[0].value() for (k, v) + in self.get_marc_tag_value_map([data_field_tag]).items() + if len(v) > 0] + + if not values: + return "" + + value = values[0] + + if index and len(value) > index + 1: + return value[index] + + if index: + return "" + + return value + + def get_marc_data_fields(self, field_tags: list, subfield_codes=[], **kwargs) -> list: """ Get the values of specified subfields from given MARC fields. + Set the `exclude_subfields` kwarg to exclude the specified subfield_codes. + + Set the `process_value` kwarg to pass the value through your own code to + do transformations based on the field tag, code and value. See `map_subject` for + an example. + :param field_tags: A list of MARC fields. - :param subfield_codes: A list of subfield codes to filter the values. If empty, all subfields will be included. + :param subfield_codes: A list of subfield codes to filter the values. If empty, + all subfields will be included. :return: A list of values of the specified subfields. """ + # Don't let any control tags sneak in! They don't have subfields. + data_field_tags = [tag for tag in field_tags + if tag.isnumeric() and int(tag) > 99] + def subfield_matches(check_code: str, subfield_codes: list, exclude_subfields: bool) -> bool: """ :param check_code: The code to check against the subfield codes. :param subfield_codes: A list of subfield codes to include / exclude - :param exclude_subfields: A boolean value indicating whether to exclude the specified subfield codes. - :return: A boolean value indicating whether the check_code is included or excluded based on the subfield_codes and exclude_subfields parameters. + :param exclude_subfields: A boolean value indicating whether to exclude the + specified subfield codes. + :return: A boolean value indicating whether the check_code is included or + excluded based on the subfield_codes and exclude_subfields parameters. 
""" if not subfield_codes: return True @@ -113,12 +218,17 @@ def subfield_matches(check_code: str, subfield_codes: list, exclude_subfields: b else: return check_code in subfield_codes - exclude_subfields = "exclude_subfields" in kwargs and kwargs[ - 'exclude_subfields'] + if "process_value" in kwargs and isinstance(kwargs["process_value"], Callable): + process_value = kwargs["process_value"] + else: + process_value = None - matching_fields = [field for field in self.get_marc_fields(field_tags)] + exclude_subfields = "exclude_subfields" in kwargs and kwargs[ + "exclude_subfields"] - value_list = [value + value_list = [process_value(value, field_tag, subfield[0]) + if process_value else value + for (field_tag, matching_fields) in self.get_marc_tag_value_map(data_field_tags).items() for matching_field in matching_fields for subfield in list(matching_field.subfields_as_dict().items()) for value in subfield[1] @@ -126,16 +236,101 @@ def subfield_matches(check_code: str, subfield_codes: list, exclude_subfields: b return value_list if isinstance(value_list, list) else [] - def get_marc_fields(self, field_tags: list) -> list: + def get_marc_tag_value_map(self, field_tags: list) -> dict: """ - Get the specified MARC fields from the source_metadata. + Get the specified MARC fields from the source_metadata, mapping by field tag :param field_tags: List of MARC fields to retrieve. :return: List of MARC fields from the source_metadata. """ + return {field_tag: self.source_metadata.get("marc").get_fields(field_tag) for + field_tag in field_tags} + + def get_marc_leader(self, leader_key: str): + """ + Retrieve the value of specified leader key from the MARC metadata. - return [f for f in self.source_metadata.get("marc").get_fields(*field_tags)] + Couple things: + * We're not accommodating passing a slice, which pymarc can handle should it be necessary + * Both + :param leader_key: The key of the leader field to retrieve. + :type leader_key: str + :return: The value of the specified leader key. 
+ :rtype: str or None + """ + leader = self.source_metadata.get("marc").leader + + if str(leader_key).isnumeric(): + return leader[int(leader_key)] + + if hasattr(leader, leader_key): + return leader.getattr(leader_key, "") + + return "" + + def get_type_mapping(self): + """ + Legacy code verbatim + :return: + """ + return { + "datafield": OrderedDict( + [("AJ", ("Journal", "Text")), + ("AN", ("Newspaper", "Text")), + ("BI", ("Biography", "Text")), + ("BK", ("Book", "Text")), + ("CF", ("Computer File", "Interactive Resource")), + ("CR", ("CDROM", "Interactive Resource")), + ("CS", ("Software", "Software")), + ("DI", ("Dictionaries", "Text")), + ("DR", ("Directories", "Text")), + ("EN", ("Encyclopedias", "Text")), + ("HT", ("HathiTrust", None)), + ("MN", ("Maps-Atlas", "Image")), + ("MP", ("Map", "Image")), + ("MS", ("Musical Score", "Text")), + ("MU", ("Music", "Text")), + ("MV", ("Archive", "Collection")), + ("MW", ("Manuscript", "Text")), + ("MX", ("Mixed Material", "Collection")), + ("PP", ("Photograph/Pictorial Works", "Image")), + ("RC", ("Audio CD", "Sound")), + ("RL", ("Audio LP", "Sound")), + ("RM", ("Music", "Sound")), + ("RS", ("Spoken word", "Sound")), + ("RU", (None, "Sound")), + ("SE", ("Serial", "Text")), + ("SX", ("Serial", "Text")), + ("VB", ("Video (Blu-ray)", "Moving Image")), + ("VD", ("Video (DVD)", "Moving Image")), + ("VG", ("Video Games", "Moving Image")), + ("VH", ("Video (VHS)", "Moving Image")), + ("VL", ("Motion Picture", "Moving Image")), + ("VM", ("Visual Material", "Image")), + ("WM", ("Microform", "Text")), + ("XC", ("Conference", "Text")), + ("XS", ("Statistics", "Text"))]), + "leader": OrderedDict( + [("am", ("Book", "Text")), + ("asn", ("Newspapers", "Text")), + ("as", ("Serial", "Text")), + ("aa", ("Book", "Text")), + ("a(?![mcs])", ("Serial", "Text")), + ("[cd].*", ("Musical Score", "Text")), + ("t.*", ("Manuscript", "Text")), + ("[ef].*", ("Maps", "Image")), + ("g.[st]", ("Photograph/Pictorial Works", "Image")), + ("g.[cdfo]", ("Film/Video", "Moving Image")), + ("g.*", (None, "Image")), + ("k.*", ("Photograph/Pictorial Works", "Image")), + ("i.*", ("Nonmusic", "Sound")), + ("j.*", ("Music", "Sound")), + ("r.*", (None, "Physical object")), + ("p[cs].*", (None, "Collection")), + ("m.*", (None, "Interactive Resource")), + ("o.*", (None, "Collection"))]) + } class UcbTindVernacular(OaiVernacular): record_cls = UcbTindRecord From 1357c8533bfca3607c8c9fcec18407010bc6227c Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Mon, 29 Jan 2024 08:25:45 -0800 Subject: [PATCH 07/19] [WIP] --- .../mappers/marc/ucb_tind_mapper.py | 123 ++++++++++++------ 1 file changed, 83 insertions(+), 40 deletions(-) diff --git a/metadata_mapper/mappers/marc/ucb_tind_mapper.py b/metadata_mapper/mappers/marc/ucb_tind_mapper.py index ac28283a3..f072ba4f6 100644 --- a/metadata_mapper/mappers/marc/ucb_tind_mapper.py +++ b/metadata_mapper/mappers/marc/ucb_tind_mapper.py @@ -9,24 +9,26 @@ from typing import Callable from collections import OrderedDict +import re + class UcbTindRecord(MarcRecord): def UCLDC_map(self): return { "calisphere-id": self.legacy_couch_db_id.split("--")[1], - "isShownAt": self.get_marc_values(["856"], ["u"]), - "isShownBy": self.get_marc_values(["856"], ["u"]), - "language": self.get_marc_values(["041"], ["a"]), - "date": self.get_marc_values(["260"], ["c"]), - "publisher": self.get_marc_values(["260"], ["a", "b"]), + "isShownAt": self.get_marc_data_fields(["856"], ["u"]), + "isShownBy": self.get_marc_data_fields(["856"], ["u"]), + "language": 
self.get_marc_data_fields(["041"], ["a"]), + "date": self.get_marc_data_fields(["260"], ["c"]), + "publisher": self.get_marc_data_fields(["260"], ["a", "b"]), "format": self.map_format, "extent": self.map_extent, - "identifier": self.get_marc_values(["020", "022", "035"], ["a"]), - "creator": self.get_marc_values(["100", "110", "111"]), + "identifier": self.get_marc_data_fields(["020", "022", "035"], ["a"]), + "creator": self.get_marc_data_fields(["100", "110", "111"]), "relation": self.map_relation, "description": self.map_description, - "rights": self.get_marc_values(["506", "540"]), - "temporal": self.get_marc_values(["648"]), - "contributor": self.get_marc_values(["700", "710", "711", "720"]), + "rights": self.get_marc_data_fields(["506", "540"]), + "temporal": self.get_marc_data_fields(["648"]), + "contributor": self.get_marc_data_fields(["700", "710", "711", "720"]), "title": self.map_title, "spatial": self.map_spatial, "spec_type": self.map_spec_type, @@ -35,15 +37,49 @@ def UCLDC_map(self): } def map_type(self): - self.get_marc_leader("type_of_control") + self.get_marc_leader("bibliographic_level") + value = [] + + for type_mapping in self.get_type_mapping(): + value.append(type_mapping[1]) + + return value - print(self.get_marc_control_field("005", 1)) - print(self.get_marc_control_field("001", 3)) def map_spec_type(self): - pass + value = [] + + for type_mapping in self.get_type_mapping(): + value.append(type_mapping[0]) + + if (self.get_marc_control_field("008", 28) + in ("a", "c", "f", "i", "l", "m", "o", "s") or + self.get_marc_data_fields(["086", "087"])): + value.append("Government Document") + + return value + + def get_type_mapping(self): + type_mapping = [] + + compare = (self.get_marc_leader("type_of_control") + + self.get_marc_leader("bibliographic_level") + + self.get_marc_control_field("007", 1) + + self.get_marc_control_field("008", 21)) + + for (key, value) in self.type_mapping["leader"]: + if re.match(f"^{key}", compare): + type_mapping.append(value) + + return type_mapping def get_metadata_fields(self): + """ + Returns a list of metadata fields used by map_format, map_subject, + map_temporal, map_format + + :return: A list of fields + :rtype: list + """ return [str(i) for i in [600, 630, 650, 651] + list(range(610, 620)) + list( range(653, 659)) + list(range(690, 700))] @@ -87,14 +123,13 @@ def get_delimiters(tag, code): def split_subject(value, tag, code): delimiters = get_delimiters(tag, code) + # Skip codes that are numeric if not code or code.isdigit(): - # Skip codes that are numeric return value = value.rstrip(", ") if value: - delimiters = get_delimiters(tag, code) for delimiter in delimiters: values = [delimiter.join(values)] if delimiter != delimiters[-1]: @@ -103,7 +138,8 @@ def split_subject(value, tag, code): return values - return self.get_marc_data_fields(self.get_metadata_fields(), process_value=split_subject) + return self.get_marc_data_fields(self.get_metadata_fields(), + process_value=split_subject) def map_temporal(self) -> list: f648 = self.get_marc_data_fields(["648"]) @@ -134,7 +170,8 @@ def map_extent(self) -> list: :return: A list of extent values. """ - return self.get_marc_data_fields(["300"]) + self.get_marc_data_fields(["340"], ["b"]) + return self.get_marc_data_fields(["300"]) + self.get_marc_data_fields(["340"], + ["b"]) def map_title(self) -> list: # 245, all subfields except c @@ -163,7 +200,8 @@ def get_marc_control_field(self, field_tag: str, index: int = None) -> list: """ # Don't let any data tags sneak in! They have subfields. 
- data_field_tag = field_tag if field_tag.isnumeric() and int(field_tag) < 100 else "" + data_field_tag = field_tag if field_tag.isnumeric() and int( + field_tag) < 100 else "" values = [v[0].value() for (k, v) in self.get_marc_tag_value_map([data_field_tag]).items() @@ -182,7 +220,8 @@ def get_marc_control_field(self, field_tag: str, index: int = None) -> list: return value - def get_marc_data_fields(self, field_tags: list, subfield_codes=[], **kwargs) -> list: + def get_marc_data_fields(self, field_tags: list, subfield_codes=[], + **kwargs) -> list: """ Get the values of specified subfields from given MARC fields. @@ -202,7 +241,8 @@ def get_marc_data_fields(self, field_tags: list, subfield_codes=[], **kwargs) -> data_field_tags = [tag for tag in field_tags if tag.isnumeric() and int(tag) > 99] - def subfield_matches(check_code: str, subfield_codes: list, exclude_subfields: bool) -> bool: + def subfield_matches(check_code: str, subfield_codes: list, + exclude_subfields: bool) -> bool: """ :param check_code: The code to check against the subfield codes. :param subfield_codes: A list of subfield codes to include / exclude @@ -228,11 +268,13 @@ def subfield_matches(check_code: str, subfield_codes: list, exclude_subfields: b value_list = [process_value(value, field_tag, subfield[0]) if process_value else value - for (field_tag, matching_fields) in self.get_marc_tag_value_map(data_field_tags).items() + for (field_tag, matching_fields) in + self.get_marc_tag_value_map(data_field_tags).items() for matching_field in matching_fields for subfield in list(matching_field.subfields_as_dict().items()) for value in subfield[1] - if subfield_matches(subfield[0], subfield_codes, exclude_subfields)] + if + subfield_matches(subfield[0], subfield_codes, exclude_subfields)] return value_list if isinstance(value_list, list) else [] @@ -312,26 +354,27 @@ def get_type_mapping(self): ("XC", ("Conference", "Text")), ("XS", ("Statistics", "Text"))]), "leader": OrderedDict( - [("am", ("Book", "Text")), - ("asn", ("Newspapers", "Text")), - ("as", ("Serial", "Text")), - ("aa", ("Book", "Text")), + [("am", ("Book", "Text")), + ("asn", ("Newspapers", "Text")), + ("as", ("Serial", "Text")), + ("aa", ("Book", "Text")), ("a(?![mcs])", ("Serial", "Text")), - ("[cd].*", ("Musical Score", "Text")), - ("t.*", ("Manuscript", "Text")), - ("[ef].*", ("Maps", "Image")), - ("g.[st]", ("Photograph/Pictorial Works", "Image")), - ("g.[cdfo]", ("Film/Video", "Moving Image")), - ("g.*", (None, "Image")), - ("k.*", ("Photograph/Pictorial Works", "Image")), - ("i.*", ("Nonmusic", "Sound")), - ("j.*", ("Music", "Sound")), - ("r.*", (None, "Physical object")), - ("p[cs].*", (None, "Collection")), - ("m.*", (None, "Interactive Resource")), - ("o.*", (None, "Collection"))]) + ("[cd].*", ("Musical Score", "Text")), + ("t.*", ("Manuscript", "Text")), + ("[ef].*", ("Maps", "Image")), + ("g.[st]", ("Photograph/Pictorial Works", "Image")), + ("g.[cdfo]", ("Film/Video", "Moving Image")), + ("g.*", (None, "Image")), + ("k.*", ("Photograph/Pictorial Works", "Image")), + ("i.*", ("Nonmusic", "Sound")), + ("j.*", ("Music", "Sound")), + ("r.*", (None, "Physical object")), + ("p[cs].*", (None, "Collection")), + ("m.*", (None, "Interactive Resource")), + ("o.*", (None, "Collection"))]) } + class UcbTindVernacular(OaiVernacular): record_cls = UcbTindRecord From 7d188407c9bace5ccf0d6c3096611b8873c2a5a5 Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Mon, 29 Jan 2024 11:21:29 -0800 Subject: [PATCH 08/19] [WIP] --- metadata_fetcher/fetchers/Fetcher.py | 
8 ++++- metadata_fetcher/lambda_function.py | 5 ++-- .../mappers/marc/ucb_tind_mapper.py | 30 ++++++++----------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index ebaa75f88..34dd7e68a 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -3,7 +3,8 @@ from requests.adapters import HTTPAdapter, Retry from rikolti.utils.versions import put_vernacular_page - +import time +import os logger = logging.getLogger(__name__) @@ -52,6 +53,11 @@ def fetch_page(self): f"[{self.collection_id}]: fetching page {self.write_page} " f"at {page.get('url')}" ) + + # Added because collection 28011 was failing without this + print(f"Sleeping in {os.path.basename(__file__)}!") + time.sleep(1) + print("Done Sleeping!") try: response = requests.get(**page) response.raise_for_status() diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index 1eff42c89..ef541dc1c 100644 --- a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -3,6 +3,7 @@ import logging import sys import time +import os from .fetchers.Fetcher import Fetcher from rikolti.utils.versions import create_vernacular_version @@ -22,7 +23,7 @@ def import_fetcher(harvest_type): # AWS Lambda entry point -def fetch_collection(payload, vernacular_version, context, sleep=False) -> list[dict]: +def fetch_collection(payload, vernacular_version, sleep=True) -> list[dict]: """ returns a list of dicts with the following keys: document_count: int @@ -41,7 +42,7 @@ def fetch_collection(payload, vernacular_version, context, sleep=False) -> list[ fetch_status = [] try: if sleep: - print("Sleeping!") + print(f"Sleeping in {os.path.basename(__file__)}!") time.sleep(1) print("Done Sleeping!") fetcher = fetcher_class(payload) diff --git a/metadata_mapper/mappers/marc/ucb_tind_mapper.py b/metadata_mapper/mappers/marc/ucb_tind_mapper.py index f072ba4f6..3209ee42f 100644 --- a/metadata_mapper/mappers/marc/ucb_tind_mapper.py +++ b/metadata_mapper/mappers/marc/ucb_tind_mapper.py @@ -15,6 +15,7 @@ class UcbTindRecord(MarcRecord): def UCLDC_map(self): return { "calisphere-id": self.legacy_couch_db_id.split("--")[1], + "_id": self.get_marc_data_fields(["901"], ["a"]), "isShownAt": self.get_marc_data_fields(["856"], ["u"]), "isShownBy": self.get_marc_data_fields(["856"], ["u"]), "language": self.get_marc_data_fields(["041"], ["a"]), @@ -22,7 +23,8 @@ def UCLDC_map(self): "publisher": self.get_marc_data_fields(["260"], ["a", "b"]), "format": self.map_format, "extent": self.map_extent, - "identifier": self.get_marc_data_fields(["020", "022", "035"], ["a"]), + "identifier": self.get_marc_data_fields(["020", "022", "024", "901"], + ["a"]), "creator": self.get_marc_data_fields(["100", "110", "111"]), "relation": self.map_relation, "description": self.map_description, @@ -38,18 +40,16 @@ def UCLDC_map(self): def map_type(self): value = [] - - for type_mapping in self.get_type_mapping(): - value.append(type_mapping[1]) + for types in self.get_matching_types(): + value.append(types[1]) return value - def map_spec_type(self): value = [] - for type_mapping in self.get_type_mapping(): - value.append(type_mapping[0]) + for types in self.get_matching_types(): + value.append(types[0]) if (self.get_marc_control_field("008", 28) in ("a", "c", "f", "i", "l", "m", "o", "s") or @@ -58,15 +58,14 @@ def map_spec_type(self): return value - def get_type_mapping(self): + def get_matching_types(self): 
type_mapping = [] - compare = (self.get_marc_leader("type_of_control") + self.get_marc_leader("bibliographic_level") + self.get_marc_control_field("007", 1) + self.get_marc_control_field("008", 21)) - for (key, value) in self.type_mapping["leader"]: + for (key, value) in self.get_types()["leader"].items(): if re.match(f"^{key}", compare): type_mapping.append(value) @@ -223,7 +222,7 @@ def get_marc_control_field(self, field_tag: str, index: int = None) -> list: def get_marc_data_fields(self, field_tags: list, subfield_codes=[], **kwargs) -> list: """ - Get the values of specified subfields from given MARC fields. + Get the values of specified subfields from given MARC fields. This allows control fields too. Set the `exclude_subfields` kwarg to exclude the specified subfield_codes. @@ -236,11 +235,6 @@ def get_marc_data_fields(self, field_tags: list, subfield_codes=[], all subfields will be included. :return: A list of values of the specified subfields. """ - - # Don't let any control tags sneak in! They don't have subfields. - data_field_tags = [tag for tag in field_tags - if tag.isnumeric() and int(tag) > 99] - def subfield_matches(check_code: str, subfield_codes: list, exclude_subfields: bool) -> bool: """ @@ -269,7 +263,7 @@ def subfield_matches(check_code: str, subfield_codes: list, value_list = [process_value(value, field_tag, subfield[0]) if process_value else value for (field_tag, matching_fields) in - self.get_marc_tag_value_map(data_field_tags).items() + self.get_marc_tag_value_map(field_tags).items() for matching_field in matching_fields for subfield in list(matching_field.subfields_as_dict().items()) for value in subfield[1] @@ -311,7 +305,7 @@ def get_marc_leader(self, leader_key: str): return "" - def get_type_mapping(self): + def get_types(self): """ Legacy code verbatim :return: From fbfba8711a8b61721d55d3eedbce7c0185fe8145 Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Fri, 2 Feb 2024 12:01:41 -0800 Subject: [PATCH 09/19] [WIP] Collection 27649 working on the whole --- metadata_mapper/mappers/marc/marc_mapper.py | 135 +++++++++- .../mappers/marc/ucb_tind_mapper.py | 234 ++++-------------- .../mappers/oai/cca_vault_mapper.py | 2 +- metadata_mapper/validator/validator.py | 13 + 4 files changed, 192 insertions(+), 192 deletions(-) diff --git a/metadata_mapper/mappers/marc/marc_mapper.py b/metadata_mapper/mappers/marc/marc_mapper.py index e4cbe4500..49d811a53 100644 --- a/metadata_mapper/mappers/marc/marc_mapper.py +++ b/metadata_mapper/mappers/marc/marc_mapper.py @@ -1,18 +1,143 @@ from ..oai.oai_mapper import OaiVernacular from ..mapper import Record -from lxml import etree -from sickle import models - -from pprint import pprint +from typing import Callable class MarcRecord(Record): def UCLDC_map(self): - # pprint(self.source_metadata) return { } + def map_type(self): + value = [] + for types in self.get_matching_types(): + value.append(types[1]) + + return value + + def get_marc_control_field(self, field_tag: str, index: int = None) -> list: + """ + Get MARC control field. Returns an empty string if: + * Control field isn't set + * No value exists at the requested index + Otherwise it returns a value + + TODO: maybe need to accept slices in addition to ints for the index + + :param field_tag: Field tag to retrieve. + :param index: A specific index to fetch + :return: List of values for the control fields. + """ + + # Don't let any data tags sneak in! They have subfields. 
+ data_field_tag = field_tag if field_tag.isnumeric() and int( + field_tag) < 100 else "" + + values = [v[0].value() for (k, v) + in self.get_marc_tag_value_map([data_field_tag]).items() + if len(v) > 0] + + if not values: + return "" + + value = values[0] + + if index and len(value) > index + 1: + return value[index] + + if index: + return "" + + return value + + def get_marc_data_fields(self, field_tags: list, subfield_codes=[], + **kwargs) -> list: + """ + Get the values of specified subfields from given MARC fields. This allows control fields too. + + Set the `exclude_subfields` kwarg to exclude the specified subfield_codes. + + Set the `process_value` kwarg to pass the value through your own code to + do transformations based on the field tag, code and value. See `map_subject` for + an example. + + :param field_tags: A list of MARC fields. + :param subfield_codes: A list of subfield codes to filter the values. If empty, + all subfields will be included. + :return: A list of values of the specified subfields. + """ + def subfield_matches(check_code: str, subfield_codes: list, + exclude_subfields: bool) -> bool: + """ + :param check_code: The code to check against the subfield codes. + :param subfield_codes: A list of subfield codes to include / exclude + :param exclude_subfields: A boolean value indicating whether to exclude the + specified subfield codes. + :return: A boolean value indicating whether the check_code is included or + excluded based on the subfield_codes and exclude_subfields parameters. + """ + if not subfield_codes: + return True + if exclude_subfields: + return check_code not in subfield_codes + else: + return check_code in subfield_codes + + if "process_value" in kwargs and isinstance(kwargs["process_value"], Callable): + process_value = kwargs["process_value"] + else: + process_value = None + + exclude_subfields = "exclude_subfields" in kwargs and kwargs[ + "exclude_subfields"] + + value_list = [process_value(value, field_tag, subfield[0]) + if process_value else value + for (field_tag, matching_fields) in + self.get_marc_tag_value_map(field_tags).items() + for matching_field in matching_fields + for subfield in list(matching_field.subfields_as_dict().items()) + for value in subfield[1] + if + subfield_matches(subfield[0], subfield_codes, exclude_subfields)] + + return value_list if isinstance(value_list, list) else [] + + def get_marc_tag_value_map(self, field_tags: list) -> dict: + """ + Get the specified MARC fields from the source_metadata, mapping by field tag + + :param field_tags: List of MARC fields to retrieve. + :return: List of MARC fields from the source_metadata. + """ + return {field_tag: self.source_metadata.get("marc").get_fields(field_tag) for + field_tag in field_tags} + + def get_marc_leader(self, leader_key: str): + """ + Retrieve the value of specified leader key from the MARC metadata. + + Couple things: + * We're not accommodating passing a slice, which pymarc can handle should it be necessary + * Both + + :param leader_key: The key of the leader field to retrieve. + :type leader_key: str + :return: The value of the specified leader key. 
+ :rtype: str or None + """ + leader = self.source_metadata.get("marc").leader + + if str(leader_key).isnumeric(): + return leader[int(leader_key)] + + if hasattr(leader, leader_key): + return leader.getattr(leader_key, "") + + return "" + + class MarcVernacular(OaiVernacular): pass diff --git a/metadata_mapper/mappers/marc/ucb_tind_mapper.py b/metadata_mapper/mappers/marc/ucb_tind_mapper.py index 3209ee42f..95a1dccd4 100644 --- a/metadata_mapper/mappers/marc/ucb_tind_mapper.py +++ b/metadata_mapper/mappers/marc/ucb_tind_mapper.py @@ -6,44 +6,54 @@ from lxml import etree from io import StringIO -from typing import Callable +from ..mapper import Validator + from collections import OrderedDict import re + class UcbTindRecord(MarcRecord): def UCLDC_map(self): return { "calisphere-id": self.legacy_couch_db_id.split("--")[1], "_id": self.get_marc_data_fields(["901"], ["a"]), - "isShownAt": self.get_marc_data_fields(["856"], ["u"]), - "isShownBy": self.get_marc_data_fields(["856"], ["u"]), + "isShownAt": self.map_is_shown_at, + "isShownBy": self.map_is_shown_by, + "alt_title": self.get_marc_data_fields(["246"], ["6"], + exclude_subfields=True), "language": self.get_marc_data_fields(["041"], ["a"]), "date": self.get_marc_data_fields(["260"], ["c"]), "publisher": self.get_marc_data_fields(["260"], ["a", "b"]), - "format": self.map_format, + "format": self.get_marc_data_fields(["655"], ["2"], + exclude_subfields=True), "extent": self.map_extent, - "identifier": self.get_marc_data_fields(["020", "022", "024", "901"], + "identifier": self.get_marc_data_fields(["024", "901", "035"], ["a"]), - "creator": self.get_marc_data_fields(["100", "110", "111"]), + "contributor": self.get_marc_data_fields(["100", "110", "111"]), + "creator": self.get_marc_data_fields(["700"], ["a"]), "relation": self.map_relation, + "provenance": self.get_marc_data_fields(["541"], ["a"]), "description": self.map_description, "rights": self.get_marc_data_fields(["506", "540"]), "temporal": self.get_marc_data_fields(["648"]), - "contributor": self.get_marc_data_fields(["700", "710", "711", "720"]), "title": self.map_title, "spatial": self.map_spatial, "spec_type": self.map_spec_type, "subject": self.map_subject, - "type": self.map_type, + "type": self.get_marc_data_fields(["336"]) } - def map_type(self): - value = [] - for types in self.get_matching_types(): - value.append(types[1]) + def map_is_shown_at(self): + field_001 = self.get_marc_control_field("001") + if field_001: + return "https://digicoll.lib.berkeley.edu/record/" + field_001 - return value + def map_is_shown_by(self): + field_001 = self.get_marc_control_field("001") + if field_001: + return ("https://digicoll.lib.berkeley.edu/nanna/thumbnail/v2/" + + field_001 + "?redirect=1") def map_spec_type(self): value = [] @@ -87,58 +97,9 @@ def map_spatial(self) -> list: return f651 + self.get_marc_data_fields(self.get_metadata_fields(), ["z"]) - def map_format(self) -> list: - f3xx = self.get_marc_data_fields(["337", "338", "340"], ["a"]), - - return f3xx + self.get_marc_data_fields(self.get_metadata_fields(), ["v"]) - def map_subject(self) -> list: - - def get_delimiters(tag, code): - """ - Returns the appropriate delimiter(s) based on the tag and code - """ - if tag == "658": - if code == "b": - return [":"] - elif code == "c": - return [" [", "]"] - elif code == "d": - return ["--"] - elif ((tag == "653") or (int(tag) in range(690, 700)) or - (code == "b" and - tag in ("654", "655")) or (code in ("v", "x", "y", "z"))): - return ["--"] - elif (tag == "610"): - if code == 
"b": - return [" "] - else: - return ["--"] - elif code == "d": - return [", "] - - return [". "] - - def split_subject(value, tag, code): - delimiters = get_delimiters(tag, code) - - # Skip codes that are numeric - if not code or code.isdigit(): - return - - value = value.rstrip(", ") - - if value: - for delimiter in delimiters: - values = [delimiter.join(values)] - if delimiter != delimiters[-1]: - # Append an empty value for subsequent joins - values.append("") - - return values - - return self.get_marc_data_fields(self.get_metadata_fields(), - process_value=split_subject) + fields = self.get_metadata_fields() + return [{"name": s} for s in self.get_marc_data_fields(fields, ["2"], exclude_subfields=True)] def map_temporal(self) -> list: f648 = self.get_marc_data_fields(["648"]) @@ -169,8 +130,8 @@ def map_extent(self) -> list: :return: A list of extent values. """ - return self.get_marc_data_fields(["300"]) + self.get_marc_data_fields(["340"], - ["b"]) + return [", ".join(self.get_marc_data_fields(["300"]) + self.get_marc_data_fields(["340"], + ["b"]))] def map_title(self) -> list: # 245, all subfields except c @@ -184,126 +145,6 @@ def map_title(self) -> list: return f245 + f242 + f240 - def get_marc_control_field(self, field_tag: str, index: int = None) -> list: - """ - Get MARC control field. Returns an empty string if: - * Control field isn't set - * No value exists at the requested index - Otherwise it returns a value - - TODO: maybe need to accept slices in addition to ints for the index - - :param field_tag: Field tag to retrieve. - :param index: A specific index to fetch - :return: List of values for the control fields. - """ - - # Don't let any data tags sneak in! They have subfields. - data_field_tag = field_tag if field_tag.isnumeric() and int( - field_tag) < 100 else "" - - values = [v[0].value() for (k, v) - in self.get_marc_tag_value_map([data_field_tag]).items() - if len(v) > 0] - - if not values: - return "" - - value = values[0] - - if index and len(value) > index + 1: - return value[index] - - if index: - return "" - - return value - - def get_marc_data_fields(self, field_tags: list, subfield_codes=[], - **kwargs) -> list: - """ - Get the values of specified subfields from given MARC fields. This allows control fields too. - - Set the `exclude_subfields` kwarg to exclude the specified subfield_codes. - - Set the `process_value` kwarg to pass the value through your own code to - do transformations based on the field tag, code and value. See `map_subject` for - an example. - - :param field_tags: A list of MARC fields. - :param subfield_codes: A list of subfield codes to filter the values. If empty, - all subfields will be included. - :return: A list of values of the specified subfields. - """ - def subfield_matches(check_code: str, subfield_codes: list, - exclude_subfields: bool) -> bool: - """ - :param check_code: The code to check against the subfield codes. - :param subfield_codes: A list of subfield codes to include / exclude - :param exclude_subfields: A boolean value indicating whether to exclude the - specified subfield codes. - :return: A boolean value indicating whether the check_code is included or - excluded based on the subfield_codes and exclude_subfields parameters. 
- """ - if not subfield_codes: - return True - if exclude_subfields: - return check_code not in subfield_codes - else: - return check_code in subfield_codes - - if "process_value" in kwargs and isinstance(kwargs["process_value"], Callable): - process_value = kwargs["process_value"] - else: - process_value = None - - exclude_subfields = "exclude_subfields" in kwargs and kwargs[ - "exclude_subfields"] - - value_list = [process_value(value, field_tag, subfield[0]) - if process_value else value - for (field_tag, matching_fields) in - self.get_marc_tag_value_map(field_tags).items() - for matching_field in matching_fields - for subfield in list(matching_field.subfields_as_dict().items()) - for value in subfield[1] - if - subfield_matches(subfield[0], subfield_codes, exclude_subfields)] - - return value_list if isinstance(value_list, list) else [] - - def get_marc_tag_value_map(self, field_tags: list) -> dict: - """ - Get the specified MARC fields from the source_metadata, mapping by field tag - - :param field_tags: List of MARC fields to retrieve. - :return: List of MARC fields from the source_metadata. - """ - return {field_tag: self.source_metadata.get("marc").get_fields(field_tag) for - field_tag in field_tags} - - def get_marc_leader(self, leader_key: str): - """ - Retrieve the value of specified leader key from the MARC metadata. - - Couple things: - * We're not accommodating passing a slice, which pymarc can handle should it be necessary - * Both - - :param leader_key: The key of the leader field to retrieve. - :type leader_key: str - :return: The value of the specified leader key. - :rtype: str or None - """ - leader = self.source_metadata.get("marc").leader - - if str(leader_key).isnumeric(): - return leader[int(leader_key)] - - if hasattr(leader, leader_key): - return leader.getattr(leader_key, "") - - return "" def get_types(self): """ @@ -369,8 +210,29 @@ def get_types(self): } +class UcbTindValidator(Validator): + + def setup(self): + self.add_validatable_fields([ + { + "field": "is_shown_by", + "validations": [ + Validator.str_match_ignore_url_protocol, + Validator.verify_type(str) + ] + }, + { + "field": "is_shown_at", + "validations": [ + Validator.str_match_ignore_url_protocol, + Validator.verify_type(str) + ] + } + ]) + class UcbTindVernacular(OaiVernacular): record_cls = UcbTindRecord + validator = UcbTindValidator def _process_record(self, record_element: list, request_url: str) -> UcbTindRecord: """ diff --git a/metadata_mapper/mappers/oai/cca_vault_mapper.py b/metadata_mapper/mappers/oai/cca_vault_mapper.py index c39a3e9ce..00f28d2b5 100644 --- a/metadata_mapper/mappers/oai/cca_vault_mapper.py +++ b/metadata_mapper/mappers/oai/cca_vault_mapper.py @@ -43,7 +43,7 @@ def setup(self): { "field": "is_shown_by", "validations": [ - CcaVaultValidator.str_match_ignore_url_protocol, + Validator.str_match_ignore_url_protocol, Validator.verify_type(str) ] }, diff --git a/metadata_mapper/validator/validator.py b/metadata_mapper/validator/validator.py index 07ad765a5..688fc151d 100644 --- a/metadata_mapper/validator/validator.py +++ b/metadata_mapper/validator/validator.py @@ -453,6 +453,19 @@ def dict_of(*types: list[type]) -> Callable: lam.__name__ = f"Dictionary of {', '.join([t.__name__ for t in types])}" return lam + @staticmethod + def str_match_ignore_url_protocol(validation_def: dict, + rikolti_value: Any, + comparison_value: Any) -> None: + if rikolti_value == comparison_value: + return + + if comparison_value and comparison_value.startswith('http'): + comparison_value = 
comparison_value.replace('http', 'https') + + if not rikolti_value == comparison_value: + return "Content mismatch" + # Private def _perform_validations(self, validation_def: dict[str, Any]) -> None: From 68e81d2574742ad476209926872f09ada906b104 Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Sun, 4 Feb 2024 11:45:16 -0800 Subject: [PATCH 10/19] [WIP] Get field 880 mostly working --- metadata_mapper/mappers/marc/marc_mapper.py | 81 ++++++++++--- .../mappers/marc/ucb_tind_mapper.py | 108 ++---------------- 2 files changed, 75 insertions(+), 114 deletions(-) diff --git a/metadata_mapper/mappers/marc/marc_mapper.py b/metadata_mapper/mappers/marc/marc_mapper.py index 49d811a53..fafc0f54a 100644 --- a/metadata_mapper/mappers/marc/marc_mapper.py +++ b/metadata_mapper/mappers/marc/marc_mapper.py @@ -2,20 +2,14 @@ from ..mapper import Record from typing import Callable +import re +from itertools import chain class MarcRecord(Record): def UCLDC_map(self): return { - } - def map_type(self): - value = [] - for types in self.get_matching_types(): - value.append(types[1]) - - return value - def get_marc_control_field(self, field_tag: str, index: int = None) -> list: """ Get MARC control field. Returns an empty string if: @@ -23,8 +17,6 @@ def get_marc_control_field(self, field_tag: str, index: int = None) -> list: * No value exists at the requested index Otherwise it returns a value - TODO: maybe need to accept slices in addition to ints for the index - :param field_tag: Field tag to retrieve. :param index: A specific index to fetch :return: List of values for the control fields. @@ -51,10 +43,14 @@ def get_marc_control_field(self, field_tag: str, index: int = None) -> list: return value - def get_marc_data_fields(self, field_tags: list, subfield_codes=[], + def get_marc_data_fields(self, field_tags: list, subfield_codes=[], recurse=True, **kwargs) -> list: """ - Get the values of specified subfields from given MARC fields. This allows control fields too. + TODO: Variable name meaning becomes quite fuzzy in the heart of this + function. Most variables could stand to be renamed. + + Get the values of specified subfields from given MARC fields. This allows + control fields too. Set the `exclude_subfields` kwarg to exclude the specified subfield_codes. @@ -62,6 +58,12 @@ def get_marc_data_fields(self, field_tags: list, subfield_codes=[], do transformations based on the field tag, code and value. See `map_subject` for an example. + :param recurse: Indicates whether alternate graphic representations (field 880) + should be sought. This is used here to prevent infinite loops + when this function is called to get field 880. It would also be + possible (and maybe preferable) to remove this argument and set + a `recurse` variable to false if "880" is included among + `field_tags`. :param field_tags: A list of MARC fields. :param subfield_codes: A list of subfield codes to filter the values. If empty, all subfields will be included. @@ -77,6 +79,10 @@ def subfield_matches(check_code: str, subfield_codes: list, :return: A boolean value indicating whether the check_code is included or excluded based on the subfield_codes and exclude_subfields parameters. 
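            A few illustrative outcomes (the argument values are made up, purely
            to document the rules implemented below):

                subfield_matches("a", ["a", "b"], False)  -> True
                subfield_matches("c", ["a", "b"], False)  -> False
                subfield_matches("a", ["a", "b"], True)   -> False
                subfield_matches("z", [], False)          -> True   (empty list means
                                                                     "all subfields")
                subfield_matches("6", [], False)          -> False  (subfield 6 is
                                                                     skipped unless
                                                                     explicitly listed)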
""" + + # Always exclude subfield 6 unless it is explicitly listed + if check_code == "6" and "6" not in subfield_codes: + return False if not subfield_codes: return True if exclude_subfields: @@ -84,6 +90,41 @@ def subfield_matches(check_code: str, subfield_codes: list, else: return check_code in subfield_codes + def get_alternate_graphic_representation(tag: str, code: str, index: int, + recurse=True) -> list: + """ + This is where field 880 is handled + :param tag: + :param code: + :param index: + :param recurse: + :return: + """ + if not recurse: + return [] + + subfield_6 = self.get_marc_data_fields([tag], ["6"], False) + if not subfield_6 or index >= len(subfield_6): + return [] + + match = re.match(r"^880\-([0-9]+)$", subfield_6[index]) + if not match: + return [] + + all_880 = self.get_marc_tag_value_map(["880"])["880"] + index_880 = int(match.group(1)) - 1 # 880 indices start at 1 + + if not all_880 or index_880 >= len(all_880): + return [] + + field = all_880[index_880] + subfields = field.subfields_as_dict() + + if code not in subfields: + return [] + + return subfields[code] + if "process_value" in kwargs and isinstance(kwargs["process_value"], Callable): process_value = kwargs["process_value"] else: @@ -92,17 +133,24 @@ def subfield_matches(check_code: str, subfield_codes: list, exclude_subfields = "exclude_subfields" in kwargs and kwargs[ "exclude_subfields"] - value_list = [process_value(value, field_tag, subfield[0]) - if process_value else value + value_list = [[(process_value(value, field_tag, subfield[0]) + if process_value else value)] + + get_alternate_graphic_representation(field_tag, subfield[0], field_index, recurse) for (field_tag, matching_fields) in self.get_marc_tag_value_map(field_tags).items() - for matching_field in matching_fields + for field_index, matching_field in enumerate(matching_fields) for subfield in list(matching_field.subfields_as_dict().items()) for value in subfield[1] if subfield_matches(subfield[0], subfield_codes, exclude_subfields)] - return value_list if isinstance(value_list, list) else [] + values = list(chain.from_iterable(value_list)) if isinstance(value_list, list) else [] + + deduped_values = [] + [deduped_values.append(value) for value in values + if value not in deduped_values] + + return deduped_values def get_marc_tag_value_map(self, field_tags: list) -> dict: """ @@ -138,6 +186,5 @@ def get_marc_leader(self, leader_key: str): return "" - class MarcVernacular(OaiVernacular): pass diff --git a/metadata_mapper/mappers/marc/ucb_tind_mapper.py b/metadata_mapper/mappers/marc/ucb_tind_mapper.py index 95a1dccd4..ae25e1211 100644 --- a/metadata_mapper/mappers/marc/ucb_tind_mapper.py +++ b/metadata_mapper/mappers/marc/ucb_tind_mapper.py @@ -20,8 +20,7 @@ def UCLDC_map(self): "_id": self.get_marc_data_fields(["901"], ["a"]), "isShownAt": self.map_is_shown_at, "isShownBy": self.map_is_shown_by, - "alt_title": self.get_marc_data_fields(["246"], ["6"], - exclude_subfields=True), + "alternativeTitle": self.get_marc_data_fields(["246"]), "language": self.get_marc_data_fields(["041"], ["a"]), "date": self.get_marc_data_fields(["260"], ["c"]), "publisher": self.get_marc_data_fields(["260"], ["a", "b"]), @@ -39,7 +38,6 @@ def UCLDC_map(self): "temporal": self.get_marc_data_fields(["648"]), "title": self.map_title, "spatial": self.map_spatial, - "spec_type": self.map_spec_type, "subject": self.map_subject, "type": self.get_marc_data_fields(["336"]) } @@ -55,32 +53,6 @@ def map_is_shown_by(self): return 
("https://digicoll.lib.berkeley.edu/nanna/thumbnail/v2/" + field_001 + "?redirect=1") - def map_spec_type(self): - value = [] - - for types in self.get_matching_types(): - value.append(types[0]) - - if (self.get_marc_control_field("008", 28) - in ("a", "c", "f", "i", "l", "m", "o", "s") or - self.get_marc_data_fields(["086", "087"])): - value.append("Government Document") - - return value - - def get_matching_types(self): - type_mapping = [] - compare = (self.get_marc_leader("type_of_control") + - self.get_marc_leader("bibliographic_level") + - self.get_marc_control_field("007", 1) + - self.get_marc_control_field("008", 21)) - - for (key, value) in self.get_types()["leader"].items(): - if re.match(f"^{key}", compare): - type_mapping.append(value) - - return type_mapping - def get_metadata_fields(self): """ Returns a list of metadata fields used by map_format, map_subject, @@ -95,11 +67,15 @@ def get_metadata_fields(self): def map_spatial(self) -> list: f651 = self.get_marc_data_fields(["651"], ["a"]) - return f651 + self.get_marc_data_fields(self.get_metadata_fields(), ["z"]) + values = f651 + self.get_marc_data_fields(self.get_metadata_fields(), ["z"]) + + # Stripping off trailing period + return [value[0:-1] if value[-1] == "." else value for value in values] def map_subject(self) -> list: fields = self.get_metadata_fields() - return [{"name": s} for s in self.get_marc_data_fields(fields, ["2"], exclude_subfields=True)] + return [{"name": s} for s in + self.get_marc_data_fields(fields, ["2"], exclude_subfields=True)] def map_temporal(self) -> list: f648 = self.get_marc_data_fields(["648"]) @@ -130,8 +106,9 @@ def map_extent(self) -> list: :return: A list of extent values. """ - return [", ".join(self.get_marc_data_fields(["300"]) + self.get_marc_data_fields(["340"], - ["b"]))] + return [", ".join( + self.get_marc_data_fields(["300"]) + self.get_marc_data_fields(["340"], + ["b"]))] def map_title(self) -> list: # 245, all subfields except c @@ -146,70 +123,6 @@ def map_title(self) -> list: return f245 + f242 + f240 - def get_types(self): - """ - Legacy code verbatim - :return: - """ - return { - "datafield": OrderedDict( - [("AJ", ("Journal", "Text")), - ("AN", ("Newspaper", "Text")), - ("BI", ("Biography", "Text")), - ("BK", ("Book", "Text")), - ("CF", ("Computer File", "Interactive Resource")), - ("CR", ("CDROM", "Interactive Resource")), - ("CS", ("Software", "Software")), - ("DI", ("Dictionaries", "Text")), - ("DR", ("Directories", "Text")), - ("EN", ("Encyclopedias", "Text")), - ("HT", ("HathiTrust", None)), - ("MN", ("Maps-Atlas", "Image")), - ("MP", ("Map", "Image")), - ("MS", ("Musical Score", "Text")), - ("MU", ("Music", "Text")), - ("MV", ("Archive", "Collection")), - ("MW", ("Manuscript", "Text")), - ("MX", ("Mixed Material", "Collection")), - ("PP", ("Photograph/Pictorial Works", "Image")), - ("RC", ("Audio CD", "Sound")), - ("RL", ("Audio LP", "Sound")), - ("RM", ("Music", "Sound")), - ("RS", ("Spoken word", "Sound")), - ("RU", (None, "Sound")), - ("SE", ("Serial", "Text")), - ("SX", ("Serial", "Text")), - ("VB", ("Video (Blu-ray)", "Moving Image")), - ("VD", ("Video (DVD)", "Moving Image")), - ("VG", ("Video Games", "Moving Image")), - ("VH", ("Video (VHS)", "Moving Image")), - ("VL", ("Motion Picture", "Moving Image")), - ("VM", ("Visual Material", "Image")), - ("WM", ("Microform", "Text")), - ("XC", ("Conference", "Text")), - ("XS", ("Statistics", "Text"))]), - "leader": OrderedDict( - [("am", ("Book", "Text")), - ("asn", ("Newspapers", "Text")), - ("as", ("Serial", 
"Text")), - ("aa", ("Book", "Text")), - ("a(?![mcs])", ("Serial", "Text")), - ("[cd].*", ("Musical Score", "Text")), - ("t.*", ("Manuscript", "Text")), - ("[ef].*", ("Maps", "Image")), - ("g.[st]", ("Photograph/Pictorial Works", "Image")), - ("g.[cdfo]", ("Film/Video", "Moving Image")), - ("g.*", (None, "Image")), - ("k.*", ("Photograph/Pictorial Works", "Image")), - ("i.*", ("Nonmusic", "Sound")), - ("j.*", ("Music", "Sound")), - ("r.*", (None, "Physical object")), - ("p[cs].*", (None, "Collection")), - ("m.*", (None, "Interactive Resource")), - ("o.*", (None, "Collection"))]) - } - - class UcbTindValidator(Validator): def setup(self): @@ -230,6 +143,7 @@ def setup(self): } ]) + class UcbTindVernacular(OaiVernacular): record_cls = UcbTindRecord validator = UcbTindValidator From 46b4e94cd0fd55f1aecc7c6979f4d0c844970bde Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Sun, 4 Feb 2024 12:08:29 -0800 Subject: [PATCH 11/19] [C] Add some comments --- metadata_mapper/mappers/marc/marc_mapper.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/metadata_mapper/mappers/marc/marc_mapper.py b/metadata_mapper/mappers/marc/marc_mapper.py index fafc0f54a..b00d5a2e1 100644 --- a/metadata_mapper/mappers/marc/marc_mapper.py +++ b/metadata_mapper/mappers/marc/marc_mapper.py @@ -133,19 +133,34 @@ def get_alternate_graphic_representation(tag: str, code: str, index: int, exclude_subfields = "exclude_subfields" in kwargs and kwargs[ "exclude_subfields"] + # Do we want process_value to have access to the 880 field values as well? + # If so, call process_value with value + the output of + # get_alternate_graphic_representation value_list = [[(process_value(value, field_tag, subfield[0]) if process_value else value)] + get_alternate_graphic_representation(field_tag, subfield[0], field_index, recurse) + + # Iterate the fields that have tags matching those requested for (field_tag, matching_fields) in self.get_marc_tag_value_map(field_tags).items() + + # Iterate the individual matches, tracking order in index for field_index, matching_field in enumerate(matching_fields) + + # Iterate the subfield codes in those fields for subfield in list(matching_field.subfields_as_dict().items()) + + # Iterate the values in those subfields for value in subfield[1] if + + # Ensure we're including only requested subfields subfield_matches(subfield[0], subfield_codes, exclude_subfields)] + # Flatten the output values = list(chain.from_iterable(value_list)) if isinstance(value_list, list) else [] + # Dedupe the output deduped_values = [] [deduped_values.append(value) for value in values if value not in deduped_values] From 90e5e800de5c2b59e408e6e0d4e7d4f709ea9979 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 7 Feb 2024 16:13:40 -0800 Subject: [PATCH 12/19] Sleep only in response to 503, not always --- .../fetch_registry_collections.py | 4 +++- metadata_fetcher/fetchers/Fetcher.py | 10 ++++----- metadata_fetcher/lambda_function.py | 21 +------------------ 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/metadata_fetcher/fetch_registry_collections.py b/metadata_fetcher/fetch_registry_collections.py index 30baf20ae..ba61147ff 100644 --- a/metadata_fetcher/fetch_registry_collections.py +++ b/metadata_fetcher/fetch_registry_collections.py @@ -1,6 +1,7 @@ import argparse import logging import sys +import traceback import requests @@ -75,7 +76,8 @@ def fetch_endpoint(url, limit=None, job_logger=logger): print(f"ERROR fetching collection { collection_id }: {e}") results[collection_id] = { 'status': 
'error', - 'error_message': e + 'error_message': e, + 'traceback': traceback.format_exc() } continue diff --git a/metadata_fetcher/fetchers/Fetcher.py b/metadata_fetcher/fetchers/Fetcher.py index 34dd7e68a..de43dea22 100644 --- a/metadata_fetcher/fetchers/Fetcher.py +++ b/metadata_fetcher/fetchers/Fetcher.py @@ -4,7 +4,6 @@ from requests.adapters import HTTPAdapter, Retry from rikolti.utils.versions import put_vernacular_page import time -import os logger = logging.getLogger(__name__) @@ -54,12 +53,13 @@ def fetch_page(self): f"at {page.get('url')}" ) - # Added because collection 28011 was failing without this - print(f"Sleeping in {os.path.basename(__file__)}!") - time.sleep(1) - print("Done Sleeping!") try: response = requests.get(**page) + if response.status_code == 503: + # TIND sometimes throws a 503 error, so we'll sleep & try again; + # 28011 was most notorious + time.sleep(1) + response = requests.get(**page) response.raise_for_status() except requests.exceptions.HTTPError: raise FetchError( diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py index ef541dc1c..28a66cf45 100644 --- a/metadata_fetcher/lambda_function.py +++ b/metadata_fetcher/lambda_function.py @@ -2,8 +2,6 @@ import json import logging import sys -import time -import os from .fetchers.Fetcher import Fetcher from rikolti.utils.versions import create_vernacular_version @@ -23,7 +21,7 @@ def import_fetcher(harvest_type): # AWS Lambda entry point -def fetch_collection(payload, vernacular_version, sleep=True) -> list[dict]: +def fetch_collection(payload, vernacular_version) -> list[dict]: """ returns a list of dicts with the following keys: document_count: int @@ -40,23 +38,6 @@ def fetch_collection(payload, vernacular_version, sleep=True) -> list[dict]: payload.update({'vernacular_version': vernacular_version}) next_page = payload fetch_status = [] - try: - if sleep: - print(f"Sleeping in {os.path.basename(__file__)}!") - time.sleep(1) - print("Done Sleeping!") - fetcher = fetcher_class(payload) - fetch_status.append(fetcher.fetch_page()) - except InvalidHarvestEndpoint as e: - logger.error(e) - fetch_status.append({ - 'status': 'error', - 'body': json.dumps({ - 'error': repr(e), - 'payload': payload - }) - }) - return fetch_status while not next_page.get('finished'): fetcher = fetcher_class(next_page) From cd4aa9c05e7837bc2f5eab4d4a100e3188c81ec9 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 7 Feb 2024 16:36:29 -0800 Subject: [PATCH 13/19] Put OAI mapper back slightly --- metadata_mapper/mappers/oai/oai_mapper.py | 33 +++++++++++------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/metadata_mapper/mappers/oai/oai_mapper.py b/metadata_mapper/mappers/oai/oai_mapper.py index 2f72986a4..07c30bb07 100644 --- a/metadata_mapper/mappers/oai/oai_mapper.py +++ b/metadata_mapper/mappers/oai/oai_mapper.py @@ -134,32 +134,29 @@ def map_type(self) -> list: class OaiVernacular(Vernacular): namespaces = {"oai2": "http://www.openarchives.org/OAI/2.0/"} - def parse(self, api_response: requests.Response) -> list[Record]: - api_response = bytes(api_response, "utf-8") - page = etree.XML(api_response) + def parse(self, api_response: str) -> list[Record]: + api_response_b = bytes(api_response, "utf-8") + page = etree.XML(api_response_b) - record_elements = self._get_record_elements(page) - request_url = self._get_request_url(page) - records = self._create_records(record_elements, request_url) - - return self.get_records(records) + request_elem = page.find("oai2:request", 
namespaces=self.namespaces) + request_url = request_elem.text if request_elem is not None else None - def _get_record_elements(self, page: etree.ElementBase) -> list[etree.ElementBase]: - return ( + record_elements = ( page .find("oai2:ListRecords", namespaces=self.namespaces) .findall("oai2:record", namespaces=self.namespaces) ) - def _get_request_url(self, page: etree.ElementBase) -> Optional[str]: - request_elem = page.find("oai2:request", namespaces=self.namespaces) - return request_elem.text if request_elem is not None else None + records = [self._process_record(re, request_url) + for re in record_elements] + records = list(filter(None, records)) + + return self.get_records(records) - def _create_records(self, record_elements: list[etree.ElementBase], - request_url: str) -> list[Record]: - return [self._process_record(re, request_url) for re in record_elements] - def _process_record(self, record_element, request_url): + def _process_record(self, + record_element: etree.ElementBase, + request_url: Optional[str]) -> Optional[dict]: sickle_rec = models.Record(record_element) sickle_header = sickle_rec.header @@ -173,7 +170,7 @@ def _process_record(self, record_element, request_url): return record - def strip_metadata(self, record_metadata): + def strip_metadata(self, record_metadata: dict) -> dict: stripped = {} for key, value in record_metadata.items(): if isinstance(value, str): From 0f3908c8406f310218a98645b9100ab1adeec516 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 7 Feb 2024 16:37:44 -0800 Subject: [PATCH 14/19] filter deleted records prior to searching for marc record namespace --- metadata_mapper/mappers/marc/ucb_tind_mapper.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/metadata_mapper/mappers/marc/ucb_tind_mapper.py b/metadata_mapper/mappers/marc/ucb_tind_mapper.py index ae25e1211..001e34472 100644 --- a/metadata_mapper/mappers/marc/ucb_tind_mapper.py +++ b/metadata_mapper/mappers/marc/ucb_tind_mapper.py @@ -1,10 +1,11 @@ -from .marc_mapper import MarcRecord, MarcVernacular +from .marc_mapper import MarcRecord from ..oai.oai_mapper import OaiVernacular from sickle import models from pymarc import parse_xml_to_array from lxml import etree from io import StringIO +from typing import Optional from ..mapper import Validator @@ -148,7 +149,9 @@ class UcbTindVernacular(OaiVernacular): record_cls = UcbTindRecord validator = UcbTindValidator - def _process_record(self, record_element: list, request_url: str) -> UcbTindRecord: + def _process_record(self, + record_element: etree.ElementBase, + request_url: Optional[str]) -> Optional[dict]: """ Process a record element and extract relevant information. @@ -156,6 +159,11 @@ def _process_record(self, record_element: list, request_url: str) -> UcbTindReco :param request_url: The URL of the request. :return: A dictionary containing the extracted information from the record. 
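        Note: OAI-PMH marks deleted records with status="deleted" on the header
        and omits the metadata payload, so such records are skipped (None is
        returned) before the MARC namespace lookup below. Roughly, with a
        made-up identifier:

            <record>
              <header status="deleted">
                <identifier>oai:example.org:123</identifier>
              </header>
            </record>
            -> _process_record(...) returns None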
""" + sickle_rec = models.Record(record_element) + sickle_header = sickle_rec.header + if sickle_header.deleted: + return None + marc_record_element = record_element.find(".//marc:record", namespaces={ "marc": "http://www.loc.gov/MARC21/slim"}) marc_record_string = etree.tostring(marc_record_element, @@ -167,11 +175,6 @@ def _process_record(self, record_element: list, request_url: str) -> UcbTindReco f'{marc_record_string}' '') - sickle_rec = models.Record(record_element) - sickle_header = sickle_rec.header - - if sickle_header.deleted: - return None record = { "datestamp": sickle_header.datestamp, From 55d89650e701f7ff3f59c4c88ca46e706cd67804 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 7 Feb 2024 16:39:29 -0800 Subject: [PATCH 15/19] Update DAG requirements --- dags/requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dags/requirements.txt b/dags/requirements.txt index 3ede404d1..62b26ae84 100644 --- a/dags/requirements.txt +++ b/dags/requirements.txt @@ -4,6 +4,10 @@ opensearch-py requests sickle python-dotenv +beautifulsoup4 +lxml +pymarc +MarkupSafe apache-airflow-providers-docker apache-airflow-providers-google apache-airflow-providers-amazon \ No newline at end of file From a725f571fa4a4f3a528f90781d8c4e00ac9318d6 Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 7 Feb 2024 16:39:59 -0800 Subject: [PATCH 16/19] If no mapped pages, log warning and continue --- dags/utils_by_mapper_type.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dags/utils_by_mapper_type.py b/dags/utils_by_mapper_type.py index e3732977e..01928fff1 100644 --- a/dags/utils_by_mapper_type.py +++ b/dags/utils_by_mapper_type.py @@ -120,6 +120,12 @@ def map_endpoint_task(endpoint, fetched_versions, params=None): mapper_job_results = map_endpoint(endpoint, fetched_versions, limit) mapped_versions = {} for mapper_job_result in mapper_job_results: + if not mapper_job_result.get('mapped_page_paths'): + logging.warning( + f"{mapper_job_result['collection_id']:<6}: " + "no mapped pages written" + ) + continue mapped_version = get_version( mapper_job_result['collection_id'], mapper_job_result['mapped_page_paths'][0] From d8bb3ca8a54b4127c870931e4fb28a0b7bce792b Mon Sep 17 00:00:00 2001 From: amy wieliczka Date: Wed, 7 Feb 2024 16:45:41 -0800 Subject: [PATCH 17/19] No changes to metadata_mapper/lambda_function in this PR --- metadata_mapper/lambda_function.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metadata_mapper/lambda_function.py b/metadata_mapper/lambda_function.py index 0b90c2705..ac558d80e 100644 --- a/metadata_mapper/lambda_function.py +++ b/metadata_mapper/lambda_function.py @@ -29,16 +29,15 @@ def import_vernacular_reader(mapper_type): f".mappers.{'.'.join(mapper_parent_modules)}.{snake_cased_mapper_name}_mapper", package=__package__ ) + mapper_type_words = snake_cased_mapper_name.split('_') class_type = ''.join([word.capitalize() for word in mapper_type_words]) - vernacular_class = getattr( mapper_module, f"{class_type}Vernacular") if not issubclass(vernacular_class, Vernacular): print(f"{mapper_type} not a subclass of Vernacular", file=sys.stderr) exit() - return vernacular_class From 3a1e8d32caef36d326298143e8f0da7c4dc03a91 Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Wed, 21 Feb 2024 12:32:03 -0800 Subject: [PATCH 18/19] Tweak MARC mapper * Dedupe before flattening values * Retitle a function to marc_tags_as_dict * Update documentation --- metadata_mapper/mappers/marc/marc_mapper.py | 34 +++++++++++---------- 1 file changed, 18 
insertions(+), 16 deletions(-) diff --git a/metadata_mapper/mappers/marc/marc_mapper.py b/metadata_mapper/mappers/marc/marc_mapper.py index b00d5a2e1..23be9d2e7 100644 --- a/metadata_mapper/mappers/marc/marc_mapper.py +++ b/metadata_mapper/mappers/marc/marc_mapper.py @@ -5,6 +5,7 @@ import re from itertools import chain + class MarcRecord(Record): def UCLDC_map(self): return { @@ -27,7 +28,7 @@ def get_marc_control_field(self, field_tag: str, index: int = None) -> list: field_tag) < 100 else "" values = [v[0].value() for (k, v) - in self.get_marc_tag_value_map([data_field_tag]).items() + in self.marc_tags_as_dict([data_field_tag]).items() if len(v) > 0] if not values: @@ -49,14 +50,16 @@ def get_marc_data_fields(self, field_tags: list, subfield_codes=[], recurse=True TODO: Variable name meaning becomes quite fuzzy in the heart of this function. Most variables could stand to be renamed. - Get the values of specified subfields from given MARC fields. This allows - control fields too. + In most cases, this returns the Cartesian product of the provided `field_tags` + and `subfield codes`. If `recurse` is true, it will augment to include values + from field 880. Note the special handling of code `6`. Set the `exclude_subfields` kwarg to exclude the specified subfield_codes. Set the `process_value` kwarg to pass the value through your own code to - do transformations based on the field tag, code and value. See `map_subject` for - an example. + do transformations based on the field tag, code and value. There isn't an + example of this in use currently, but it could be useful for debugging or + for context-based transformations. :param recurse: Indicates whether alternate graphic representations (field 880) should be sought. This is used here to prevent infinite loops @@ -111,7 +114,7 @@ def get_alternate_graphic_representation(tag: str, code: str, index: int, if not match: return [] - all_880 = self.get_marc_tag_value_map(["880"])["880"] + all_880 = self.marc_tags_as_dict(["880"])["880"] index_880 = int(match.group(1)) - 1 # 880 indices start at 1 if not all_880 or index_880 >= len(all_880): @@ -142,7 +145,7 @@ def get_alternate_graphic_representation(tag: str, code: str, index: int, # Iterate the fields that have tags matching those requested for (field_tag, matching_fields) in - self.get_marc_tag_value_map(field_tags).items() + self.marc_tags_as_dict(field_tags).items() # Iterate the individual matches, tracking order in index for field_index, matching_field in enumerate(matching_fields) @@ -157,17 +160,18 @@ def get_alternate_graphic_representation(tag: str, code: str, index: int, # Ensure we're including only requested subfields subfield_matches(subfield[0], subfield_codes, exclude_subfields)] - # Flatten the output - values = list(chain.from_iterable(value_list)) if isinstance(value_list, list) else [] - # Dedupe the output deduped_values = [] - [deduped_values.append(value) for value in values + [deduped_values.append(value) for value in value_list if value not in deduped_values] - return deduped_values + # Flatten the output + flattened_values = list(chain.from_iterable(deduped_values)) if ( + isinstance(deduped_values, list)) else [] + + return flattened_values - def get_marc_tag_value_map(self, field_tags: list) -> dict: + def marc_tags_as_dict(self, field_tags: list) -> dict: """ Get the specified MARC fields from the source_metadata, mapping by field tag @@ -181,9 +185,7 @@ def get_marc_leader(self, leader_key: str): """ Retrieve the value of specified leader key from the MARC 
metadata. - Couple things: - * We're not accommodating passing a slice, which pymarc can handle should it be necessary - * Both + We're not accommodating passing a slice, which pymarc can handle should it be necessary :param leader_key: The key of the leader field to retrieve. :type leader_key: str From b76d294d163e6bfd4cebc62c4988fe85007c7da5 Mon Sep 17 00:00:00 2001 From: Lucas Thurston Date: Wed, 21 Feb 2024 12:53:19 -0800 Subject: [PATCH 19/19] Add references to MARC documentation --- metadata_mapper/mappers/marc/marc_mapper.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/metadata_mapper/mappers/marc/marc_mapper.py b/metadata_mapper/mappers/marc/marc_mapper.py index 23be9d2e7..e9472a9a3 100644 --- a/metadata_mapper/mappers/marc/marc_mapper.py +++ b/metadata_mapper/mappers/marc/marc_mapper.py @@ -13,6 +13,9 @@ def UCLDC_map(self): def get_marc_control_field(self, field_tag: str, index: int = None) -> list: """ + + See: https://www.loc.gov/marc/bibliographic/bd00x.html + Get MARC control field. Returns an empty string if: * Control field isn't set * No value exists at the requested index @@ -50,6 +53,9 @@ def get_marc_data_fields(self, field_tags: list, subfield_codes=[], recurse=True TODO: Variable name meaning becomes quite fuzzy in the heart of this function. Most variables could stand to be renamed. + `Data fields` is not a specific term in MARC. This function really will accept + any field tags. See https://www.loc.gov/marc/bibliographic/ for all fields. + In most cases, this returns the Cartesian product of the provided `field_tags` and `subfield codes`. If `recurse` is true, it will augment to include values from field 880. Note the special handling of code `6`. @@ -83,7 +89,11 @@ def subfield_matches(check_code: str, subfield_codes: list, excluded based on the subfield_codes and exclude_subfields parameters. """ - # Always exclude subfield 6 unless it is explicitly listed + # Always exclude subfield 6 (Linkage, + # see: https://www.loc.gov/marc/bibliographic/ecbdcntf.html) unless it is + # explicitly listed. Not excluding this was producing results that + # were not expected. Note the explicit inclusion of 6 in + # `get_alternate_graphic_representation()`, below. if check_code == "6" and "6" not in subfield_codes: return False if not subfield_codes: @@ -96,7 +106,9 @@ def subfield_matches(check_code: str, subfield_codes: list, def get_alternate_graphic_representation(tag: str, code: str, index: int, recurse=True) -> list: """ - This is where field 880 is handled + This is where field 880 is handled. + See: https://www.loc.gov/marc/bibliographic/bd880.html + :param tag: :param code: :param index: @@ -184,6 +196,7 @@ def marc_tags_as_dict(self, field_tags: list) -> dict: def get_marc_leader(self, leader_key: str): """ Retrieve the value of specified leader key from the MARC metadata. + See: https://www.loc.gov/marc/bibliographic/bdleader.html We're not accommodating passing a slice, which pymarc can handle should it be necessary
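
For reference, a minimal sketch (not part of the patch series) of the pymarc
primitives the helpers above wrap: get_fields() and subfields_as_dict() behind
marc_tags_as_dict() and get_marc_data_fields(), Field.value() behind
get_marc_control_field(), and the leader behind get_marc_leader(). The MARCXML
content, the 880/245 linkage pair, and the specific leader attribute are
illustrative assumptions; parse_xml_to_array is used the same way the TIND
vernacular uses it.

    from io import StringIO

    from pymarc import parse_xml_to_array

    SAMPLE_MARCXML = """
    <collection xmlns="http://www.loc.gov/MARC21/slim">
      <record>
        <leader>00000cam a2200000 a 4500</leader>
        <controlfield tag="001">12345</controlfield>
        <datafield tag="245" ind1="0" ind2="0">
          <subfield code="6">880-01</subfield>
          <subfield code="a">A hypothetical title</subfield>
        </datafield>
        <datafield tag="880" ind1="0" ind2="0">
          <subfield code="6">245-01</subfield>
          <subfield code="a">Alternate-script form of the title</subfield>
        </datafield>
      </record>
    </collection>
    """

    record = parse_xml_to_array(StringIO(SAMPLE_MARCXML))[0]

    # marc_tags_as_dict() is built on get_fields(); each pymarc Field exposes
    # its subfields as {code: [values]} via subfields_as_dict().
    title_field = record.get_fields("245")[0]
    print(title_field.subfields_as_dict())
    # {'6': ['880-01'], 'a': ['A hypothetical title']}

    # get_marc_control_field("001") reduces to Field.value() on a control field.
    print(record.get_fields("001")[0].value())  # '12345'

    # Subfield $6 pairs a data field with its 880 counterpart ("880-01" <->
    # "245-01"); get_alternate_graphic_representation() follows that occurrence
    # number to pull the 880 values in alongside the original field's values.
    print(record.get_fields("880")[0].subfields_as_dict()["a"])
    # ['Alternate-script form of the title']

    # get_marc_leader() indexes the leader for numeric keys (position 7 is the
    # bibliographic level) and falls back to a named attribute for string keys;
    # recent pymarc releases expose such properties on the Leader object, so
    # the getattr() fallback here is only a defensive illustration.
    print(record.leader[7])  # 'm'
    print(getattr(record.leader, "bibliographic_level", None))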