Skip to content

Commit

Permalink
Adapt loader to changes in source data (#903)
Browse files: browse the repository at this point in the history
  • Loading branch information
leonweber authored Nov 6, 2023
1 parent 0bf8d6c commit 8b8e2d6
Showing 1 changed file with 5 additions and 10 deletions.
15 changes: 5 additions & 10 deletions bigbio/hub/hub_repos/sem_eval_2024_task_2/sem_eval_2024_task_2.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -138,8 +138,6 @@ def _info(self) -> datasets.DatasetInfo:
"secondary_id": datasets.Value("string"),
"statement": datasets.Value("string"),
"label": datasets.Value("string"),
"primary_evidence_index": [datasets.Value("int64")],
"secondary_evidence_index": [datasets.Value("int64")],
}
)
elif self.config.schema == "ct":
Expand Down Expand Up @@ -212,7 +210,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
def _generate_examples(self, data_dir: Path, split: str, config) -> Tuple[int, Dict]:
"""Yields examples as (id, example) tuples."""
if self.config.schema == "source":
with open(data_dir / f"training_data/{split}.json", "r") as f:
with open(data_dir / f"{split}.json", "r") as f:
raw_data = json.load(f)
for id_ in sorted(raw_data):
data_dict = {k.lower().replace(" ", "_"): v for k, v in raw_data[id_].items()} # make keys align with schema
Expand All @@ -221,15 +219,12 @@ def _generate_examples(self, data_dir: Path, split: str, config) -> Tuple[int, D
if "secondary_id" not in data_dict:
data_dict["secondary_id"] = ""

if "secondary_evidence_index" not in data_dict:
data_dict["secondary_evidence_index"] = []

data_dict["id"] = id_

yield id_, data_dict

elif self.config.schema == "ct": # yield only raw clinical trial data
ct_files = sorted((data_dir / "training_data" / "CT json").glob("*.json"))
ct_files = sorted((data_dir / "CT json").glob("*.json"))
for ct_file in ct_files:
with open(ct_file, "r") as f:
raw_data = json.load(f)
Expand All @@ -239,20 +234,20 @@ def _generate_examples(self, data_dir: Path, split: str, config) -> Tuple[int, D
yield id_, data_dict

elif self.config.schema == "bigbio_TE": # combine labels and clinical trial text data here
with open(data_dir / f"training_data/{split}.json", "r") as f:
with open(data_dir / f"{split}.json", "r") as f:
raw_label_data = json.load(f)

for id_ in sorted(raw_label_data):
primary_id = raw_label_data[id_]["Primary_id"]
secondary_id = raw_label_data[id_].get("Secondary_id")

with open(data_dir / f"training_data/CT json" / f"{primary_id}.json", "r") as f:
with open(data_dir / f"CT json" / f"{primary_id}.json", "r") as f:
raw_ct_data = json.load(f)

text_primary = _get_text(raw_ct_data, section=raw_label_data[id_]["Section_id"])

if secondary_id:
with open(data_dir / f"training_data/CT json" / f"{secondary_id}.json", "r") as f:
with open(data_dir / f"CT json" / f"{secondary_id}.json", "r") as f:
raw_ct_data = json.load(f)
text_secondary = _get_text(raw_ct_data, section=raw_label_data[id_]["Section_id"])
else:
Expand Down

0 comments on commit 8b8e2d6

Please sign in to comment.