# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Drug labels (prescribing information or package inserts) describe what a particular medicine
is supposed to do, who should or should not take it, how to use it, and specific safety concerns.
The US Food and Drug Administration (FDA) publishes regulations governing the content and format
of this information to provide recommendations for applicants developing labeling for new drugs
and revising labeling for already approved drugs. One of the major aspects of drug information
are safety concerns in the form of Adverse Drug Reactions (ADRs). In this evaluation, we are
focusing on extraction of ADRs from the prescription drug labels.
"""

import os
import xml.etree.ElementTree as ET
from typing import Dict, Iterator, List, Tuple

import datasets

from utils import schemas
from utils.configs import BigBioConfig
from utils.constants import Tasks

_CITATION = """\
@article{,
  author    = {Kirk Roberts and
               Dina Demner-Fushman and
               Joseph M. Tonning},
  title     = {Overview of the TAC 2017 Adverse Reaction Extraction from Drug Labels Track},
  journal   = {Proceedings of the Text Analysis Conference (TAC) 2017, November 13-14 2017, Gaithersburg MD USA},
  volume    = {},
  year      = {2017},
  url       = {https://tac.nist.gov/publications/2017/additional.papers/TAC2017.ADR_overview.proceedings.pdf},
  doi       = {},
  biburl    = {},
  bibsource = {}
}
"""
_DATASETNAME = "tac2017"

_DESCRIPTION = """\
This dataset is designed for extraction of ADRs from prescription drug labels.
"""
_HOMEPAGE = "https://bionlp.nlm.nih.gov/tac2017adversereactions/"

_LICENSE = "None provided."

_URLS = {
    "tac2017": "https://bionlp.nlm.nih.gov/tac2017adversereactions/train_xml.tar.gz",
}

_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_DISAMBIGUATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]

_SOURCE_VERSION = "1.0.0"

_BIGBIO_VERSION = "1.0.0"

# Drug labels shipped in the training archive; each one lives at
# ``train_xml/<NAME>.xml``.  Kept as a tuple (not a set) and sorted at use so
# that the order of generated examples is reproducible between runs.
_TRAIN_DRUGS = (
    "ADCETRIS", "BEPREVE", "CLEVIPREX", "DYSPORT", "FERRIPROX", "ILARIS", "KALYDECO",
    "ONFI", "SAPHRIS", "TIVICAY", "VIZAMYL", "ZYKADIA", "ADREVIEW", "BESIVANCE",
    "COARTEM", "EDARBI", "FIRAZYR", "IMBRUVICA", "KYPROLIS", "OTEZLA", "SIMPONI",
    "TOVIAZ", "VORAXAZE", "ZYTIGA", "AFINITOR", "BLINCYTO", "COMETRIQ", "ELIQUIS",
    "FULYZAQ", "INLYTA", "LUMIZYME", "PICATO", "SIRTURO", "TREANDA", "XALKORI",
    "AMPYRA", "BOSULIF", "DALVANCE", "ENTEREG", "GADAVIST", "INTELENCE", "MULTAQ",
    "POTIGA", "STENDRA", "TRULICITY", "XEOMIN", "AMYVID", "BREO", "DATSCAN",
    "EOVIST", "GILENYA", "INVOKANA", "NATAZIA", "PRADAXA", "STRIBILD", "TUDORZA",
    "XIAFLEX", "APTIOM", "CARBAGLU", "DIFICID", "ERWINAZE", "GILOTRIF", "JARDIANCE",
    "NESINA", "PRISTIQ", "TAFINLAR", "ULESFIA", "XTANDI", "ARCAPTA", "CERDELGA",
    "DOTAREM", "EYLEA", "GRANIX", "JEVTANA", "NEURACEQ", "PROLIA", "TANZEUM",
    "ULORIC", "YERVOY", "BELEODAQ", "CHOLINE", "DUAVEE", "FANAPT", "HALAVEN",
    "JUBLIA", "NORTHERA", "PROMACTA", "TECFIDERA", "VICTRELIS", "ZERBAXA", "BENLYSTA",
    "CIMZIA", "DUREZOL", "FARXIGA", "HORIZANT", "KALBITOR", "NULOJIX", "QUTENZA",
    "TEFLARO", "VIMIZIM", "ZYDELIG",
)


class Tac2017Dataset(datasets.GeneratorBasedBuilder):
    """The TAC 2017 dataset is designed for extraction of ADRs from prescription drug labels."""

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

    BUILDER_CONFIGS = [
        BigBioConfig(
            name="tac2017_source",
            version=SOURCE_VERSION,
            description="TAC 2017 source schema",
            schema="source",
            subset_id="tac2017",
        ),
        BigBioConfig(
            name="tac2017_bigbio_kb",
            version=BIGBIO_VERSION,
            description="TAC 2017 BigBio schema",
            schema="bigbio_kb",
            subset_id="tac2017",
        ),
    ]

    DEFAULT_CONFIG_NAME = "tac2017_source"

    def _info(self) -> datasets.DatasetInfo:
        """
        Describe the dataset for the configured schema

        Returns
        ----------
        datasets.DatasetInfo
            features and metadata of the "source" or "bigbio_kb" schema

        Raises
        ----------
        ValueError
            if the configured schema is neither "source" nor "bigbio_kb"
        """
        if self.config.schema == "source":
            features = datasets.Features(
                {
                    "drug": datasets.Value("string"),
                    "text": {
                        "sections": [
                            {
                                "id": datasets.Value("string"),
                                "name": datasets.Value("string"),
                                "section_text": datasets.Value("string"),
                            }
                        ],
                    },
                    "mentions": [
                        {
                            "id": datasets.Value("string"),
                            "source_id": datasets.Value("string"),
                            "type": datasets.Value("string"),
                            "section_id": datasets.Value("string"),
                            "start": datasets.Value("int32"),
                            "len": datasets.Value("int32"),
                            "str": datasets.Value("string"),
                        }
                    ],
                    "relations": [
                        {
                            "id": datasets.Value("string"),
                            "type": datasets.Value("string"),
                            "arg1": datasets.Value("string"),
                            "arg2": datasets.Value("string"),
                        }
                    ],
                    "reactions": [
                        {
                            "id": datasets.Value("string"),
                            "str": datasets.Value("string"),
                            "normalization": {
                                "id": datasets.Value("string"),
                                "meddra_pt": datasets.Value("string"),
                                "meddra_pt_id": datasets.Value("string"),
                                "meddra_llt": datasets.Value("string"),
                                "meddra_llt_id": datasets.Value("string"),
                            },
                        }
                    ],
                }
            )
        elif self.config.schema == "bigbio_kb":
            features = schemas.kb_features
        else:
            # Without this branch an unknown schema surfaced as an
            # UnboundLocalError on ``features`` below.
            raise ValueError(f"Unsupported schema: {self.config.schema!r}")

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URLS[_DATASETNAME])
        # Sorted so that the example order does not depend on hash ordering.
        files = [os.path.join(data_dir, "train_xml", f"{drug}.xml") for drug in sorted(_TRAIN_DRUGS)]
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"files": files, "split": "train"},
            ),
        ]

    @staticmethod
    def _parse_spans(start_attr, len_attr):
        """
        Parse the "start" and "len" attributes of a mention

        Both attributes hold either a single integer or, for a discontinuous
        mention, a comma separated list with one value per fragment.

        Parameters
        ----------
        start_attr : str
            value of the "start" attribute
        len_attr : str
            value of the "len" attribute

        Returns
        ----------
        list
            (start, length) integer pairs, one per fragment

        Raises
        ----------
        ValueError
            if the two attributes do not describe the same number of fragments
        """
        starts = start_attr.split(",")
        lengths = len_attr.split(",")
        if len(starts) != len(lengths):
            raise ValueError(f"Mismatched mention spans: start={start_attr!r}, len={len_attr!r}")
        return [(int(start), int(length)) for start, length in zip(starts, lengths)]

    @staticmethod
    def _children(source_label, tag):
        """
        Return the first child of a label with the given tag

        Parameters
        ----------
        source_label : etree object
            root element of a drug label
        tag : str
            tag of the wanted child element

        Returns
        ----------
        etree object or list
            the child element, or an empty list when the label has no such
            child so that callers can iterate over the result unconditionally
        """
        found = source_label.find(tag)
        # ``is not None``: an element without children is itself falsy.
        return found if found is not None else []

    def _generate_example_sections(self, uid, source_sections_tree):
        """
        Parse sections XML

        Parameters
        ----------
        uid : int
            unique identifier being updated with each execution
        source_sections_tree : etree object
            XML of drug label sections

        Returns
        ----------
        int
            updated unique identifier
        list
            drug label sections information
        """
        sections = []
        for source_section in source_sections_tree:
            sections.append(
                {
                    "id": source_section.attrib["id"],
                    "name": source_section.attrib["name"],
                    "section_text": source_section.text,
                }
            )
            uid += 1
        return (uid, sections)

    def _generate_example_mentions(self, uid, source_mentions_tree):
        """
        Parse mentions XML

        A discontinuous mention (comma separated "start"/"len") yields one
        entry per fragment; the fragments share "source_id" and "str".

        Parameters
        ----------
        uid : int
            unique identifier being updated with each execution
        source_mentions_tree : etree object
            XML of drug label ADR mentions

        Returns
        ----------
        int
            updated unique identifier
        list
            ADR mentions information
        """
        mentions = []
        for source_mention in source_mentions_tree:
            attrib = source_mention.attrib
            for start, length in self._parse_spans(attrib["start"], attrib["len"]):
                mentions.append(
                    {
                        "id": str(uid),
                        "source_id": attrib["id"],
                        "type": attrib["type"],
                        "section_id": attrib["section"],
                        "start": start,
                        "len": length,
                        "str": attrib["str"],
                    }
                )
                uid += 1
        return (uid, mentions)

    def _generate_example_relations(self, uid, source_relations_tree):
        """
        Parse relations XML

        Parameters
        ----------
        uid : int
            unique identifier being updated with each execution
        source_relations_tree : etree object
            XML of drug label ADR relations

        Returns
        ----------
        int
            updated unique identifier
        list
            drug label relations information
        """
        relations = []
        for source_relation in source_relations_tree:
            attrib = source_relation.attrib
            relations.append(
                {
                    "id": attrib["id"],
                    "type": attrib["type"],
                    "arg1": attrib["arg1"],
                    "arg2": attrib["arg2"],
                }
            )
            uid += 1
        return (uid, relations)

    def _generate_example_reactions(self, uid, source_reactions_tree):
        """
        Parse reactions XML

        Parameters
        ----------
        uid : int
            unique identifier being updated with each execution
        source_reactions_tree : etree object
            XML of drug label reaction normalizations to MedDRA terms

        Returns
        ----------
        int
            updated unique identifier
        list
            reactions normalization information
        """
        reactions = []
        for source_reaction in source_reactions_tree:
            reaction = {
                "id": source_reaction.attrib["id"],
                "str": source_reaction.attrib["str"],
                # The source features declare "normalization" as a required
                # dict, so a reaction without any child still needs one.
                "normalization": {
                    "id": "",
                    "meddra_pt": "",
                    "meddra_pt_id": "",
                    "meddra_llt": "",
                    "meddra_llt_id": "",
                },
            }

            # NOTE(review): the source schema stores a single normalization per
            # reaction, so when a reaction has several children the last wins.
            for source_normalization in source_reaction:
                attrib = source_normalization.attrib
                normalization = {"id": attrib["id"]}
                # The *_id attribute is only read when its term is present.
                for term in ("meddra_pt", "meddra_llt"):
                    if term in attrib:
                        normalization[term] = attrib[term]
                        normalization[term + "_id"] = attrib[term + "_id"]
                    else:
                        normalization[term] = ""
                        normalization[term + "_id"] = ""
                reaction["normalization"] = normalization
                uid += 1

            reactions.append(reaction)
            uid += 1
        return (uid, reactions)

    def _generate_example_kb_passages(self, uid, drug_name, source_sections_tree):
        """
        Parse sections XML into passages for KB schema

        Passage offsets are document-level: the sections are laid out back to
        back, without separator, in the order they appear in the XML.

        Parameters
        ----------
        uid : int
            unique identifier being updated with each execution
        drug_name : str
            name of the drug, used as prefix of the passage ids
        source_sections_tree : etree object
            XML of drug label sections

        Returns
        ----------
        int
            updated unique identifier
        list
            KB schema passages information
        """
        passages = []
        # Only the running length is needed, not the concatenated text.
        document_length = 0
        for source_section in source_sections_tree:
            # An empty <Section/> element has ``text`` set to None.
            passage_text = source_section.text or ""
            passage_end = document_length + len(passage_text)
            passages.append(
                {
                    "id": drug_name + "_" + source_section.attrib["id"],
                    "type": source_section.attrib["name"],
                    "text": [passage_text],
                    "offsets": [[document_length, passage_end]],
                }
            )
            document_length = passage_end
            uid += 1
        return (uid, passages)

    def _generate_example_kb_entities(self, uid, drug_name, source_mentions_tree, normalizations, passages):
        """
        Parse mentions XML into entities for KB schema, including normalizations from source "reactions" data

        Mention offsets in the XML are relative to their section; they are
        shifted by the start offset of the matching passage so that they become
        document-level.  A discontinuous mention becomes a single entity with
        one offset pair and one text per fragment.

        Parameters
        ----------
        uid : int
            unique identifier being updated with each execution
        drug_name : str
            name of the drug, used as prefix of the entity ids
        source_mentions_tree : etree object
            XML of drug label ADR mentions
        normalizations : list
            output of `_generate_kb_entity_normalizations`
        passages : list
            output of `_generate_example_kb_passages` for the same label

        Returns
        ----------
        int
            updated unique identifier
        list
            KB schema entities information

        Raises
        ----------
        ValueError
            if a mention refers to a section that is not among the passages
        """
        passages_by_id = {passage["id"]: passage for passage in passages}

        # NOTE(review): normalizations are attached by exact string equality
        # between the mention and reaction "str" — confirm both use the same
        # casing in the data.
        meddra_ids_by_str = {}
        for normalization in normalizations:
            meddra_ids_by_str.setdefault(normalization["str"], []).append(normalization["meddra_id"])

        entities = []
        for source_mention in source_mentions_tree:
            attrib = source_mention.attrib
            entity_text = attrib["str"]

            passage = passages_by_id.get(drug_name + "_" + attrib["section"])
            if passage is None:
                raise ValueError(
                    f"Mention {attrib['id']!r} of {drug_name!r} refers to unknown section {attrib['section']!r}"
                )
            section_start = passage["offsets"][0][0]
            section_text = passage["text"][0]

            spans = self._parse_spans(attrib["start"], attrib["len"])
            offsets = [[section_start + start, section_start + start + length] for start, length in spans]
            if len(spans) == 1:
                texts = [entity_text]
            else:
                # "str" covers the whole mention; the per-fragment texts are
                # taken from the section so that they line up with the offsets.
                texts = [section_text[start : start + length] for start, length in spans]

            entities.append(
                {
                    "id": drug_name + "_" + attrib["id"],
                    "type": attrib["type"],
                    "text": texts,
                    "offsets": offsets,
                    "normalized": [
                        {"db_name": "meddra", "db_id": meddra_id}
                        for meddra_id in meddra_ids_by_str.get(entity_text, [])
                    ],
                }
            )
            uid += 1
        return (uid, entities)

    def _generate_kb_entity_normalizations(self, uid, source_reactions_tree):
        """
        Parse reactions XML into entity normalizations for KB schema

        Parameters
        ----------
        uid : int
            unique identifier, returned unchanged
        source_reactions_tree : etree object
            XML of drug label reaction normalizations to MedDRA terms

        Returns
        ----------
        int
            unique identifier
        list
            reactions normalization information for KB schema: one
            {"str", "meddra_id"} entry per MedDRA PT and LLT found
        """
        normalizations = []
        for source_reaction in source_reactions_tree:
            reaction_str = source_reaction.attrib["str"]
            for source_normalization in source_reaction:
                attrib = source_normalization.attrib
                for term in ("meddra_pt", "meddra_llt"):
                    if term in attrib:
                        normalizations.append({"str": reaction_str, "meddra_id": attrib[term + "_id"]})
        return (uid, normalizations)

    def _generate_example_kb_relations(self, uid, drug_name, source_relations_tree):
        """
        Parse relations XML for KB schema

        Parameters
        ----------
        uid : int
            unique identifier being updated with each execution
        drug_name : str
            name of the drug, used as prefix of the relation and argument ids
        source_relations_tree : etree object
            XML of drug label ADR relations

        Returns
        ----------
        int
            updated unique identifier
        list
            drug label relations information for KB schema
        """
        relations = []
        for source_relation in source_relations_tree:
            attrib = source_relation.attrib
            relations.append(
                {
                    # Prefixed like passage and entity ids: the relation ids in
                    # the XML are only unique within one drug label.
                    "id": drug_name + "_" + attrib["id"],
                    "type": attrib["type"],
                    "arg1_id": drug_name + "_" + attrib["arg1"],
                    "arg2_id": drug_name + "_" + attrib["arg2"],
                    "normalized": [],
                }
            )
            uid += 1
        return (uid, relations)

    def _generate_examples(self, files, split: str) -> Iterator[Tuple[int, Dict]]:
        """
        Yields examples as (key, example) tuples.

        Parameters
        ----------
        files : list
            paths of the drug label XML files, one example per file
        split : str
            name of the split (not used)
        """
        uid = 0

        for fpath in files:
            # Parsing from the path lets the XML parser honour the encoding
            # declared in the file instead of the locale default.
            source_label = ET.parse(fpath).getroot()
            source_drug_name = source_label.attrib["drug"]

            source_text = self._children(source_label, "Text")
            source_mentions = self._children(source_label, "Mentions")
            source_relations = self._children(source_label, "Relations")
            source_reactions = self._children(source_label, "Reactions")

            if self.config.schema == "source":
                uid, sections = self._generate_example_sections(uid, source_text)
                uid, mentions = self._generate_example_mentions(uid, source_mentions)
                uid, relations = self._generate_example_relations(uid, source_relations)
                uid, reactions = self._generate_example_reactions(uid, source_reactions)

                yield uid, {
                    "drug": source_drug_name,
                    "text": {"sections": sections},
                    "mentions": mentions,
                    "relations": relations,
                    "reactions": reactions,
                }

            elif self.config.schema == "bigbio_kb":
                # The KB schema declares the example id as a string.
                example_id = str(uid)

                uid, entity_normalizations = self._generate_kb_entity_normalizations(uid, source_reactions)
                uid, passages = self._generate_example_kb_passages(uid, source_drug_name, source_text)
                uid, entities = self._generate_example_kb_entities(
                    uid, source_drug_name, source_mentions, entity_normalizations, passages
                )
                uid, relations = self._generate_example_kb_relations(uid, source_drug_name, source_relations)

                yield uid, {
                    "id": example_id,
                    "document_id": source_drug_name,
                    "passages": passages,
                    "entities": entities,
                    "relations": relations,
                    "events": [],
                    "coreferences": [],
                }

            # Guarantees distinct keys even for a label that contributed no items.
            uid += 1