Adapt loader to changes in source data (#903)

bigscience-workshop · Nov 6, 2023 · 8b8e2d6
1 parent 0bf8d6c
commit 8b8e2d6
Showing 1 changed file with 5 additions and 10 deletions.
diff --git a/bigbio/hub/hub_repos/sem_eval_2024_task_2/sem_eval_2024_task_2.py b/bigbio/hub/hub_repos/sem_eval_2024_task_2/sem_eval_2024_task_2.py
@@ -138,8 +138,6 @@ def _info(self) -> datasets.DatasetInfo:
                     "secondary_id": datasets.Value("string"),
                     "statement": datasets.Value("string"),
                     "label": datasets.Value("string"),
-                    "primary_evidence_index": [datasets.Value("int64")],
-                    "secondary_evidence_index": [datasets.Value("int64")],
                 }
             )
         elif self.config.schema == "ct":
@@ -212,7 +210,7 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
     def _generate_examples(self, data_dir: Path, split: str, config) -> Tuple[int, Dict]:
         """Yields examples as (id, example) tuples."""
         if self.config.schema == "source":
-            with open(data_dir / f"training_data/{split}.json", "r") as f:
+            with open(data_dir / f"{split}.json", "r") as f:
                 raw_data = json.load(f)
             for id_ in sorted(raw_data):
                 data_dict = {k.lower().replace(" ", "_"): v for k, v in raw_data[id_].items()} # make keys align with schema
@@ -221,15 +219,12 @@ def _generate_examples(self, data_dir: Path, split: str, config) -> Tuple[int, D
                 if "secondary_id" not in data_dict:
                     data_dict["secondary_id"] = ""
 
-                if "secondary_evidence_index" not in data_dict:
-                    data_dict["secondary_evidence_index"] = []
-
                 data_dict["id"] = id_
 
                 yield id_, data_dict
 
         elif self.config.schema == "ct": # yield only raw clinical trial data
-            ct_files = sorted((data_dir / "training_data" / "CT json").glob("*.json"))
+            ct_files = sorted((data_dir / "CT json").glob("*.json"))
             for ct_file in ct_files:
                 with open(ct_file, "r") as f:
                     raw_data = json.load(f)
@@ -239,20 +234,20 @@ def _generate_examples(self, data_dir: Path, split: str, config) -> Tuple[int, D
                     yield id_, data_dict
 
         elif self.config.schema == "bigbio_TE": # combine labels and clinical trial text data here
-            with open(data_dir / f"training_data/{split}.json", "r") as f:
+            with open(data_dir / f"{split}.json", "r") as f:
                 raw_label_data = json.load(f)
 
             for id_ in sorted(raw_label_data):
                 primary_id = raw_label_data[id_]["Primary_id"]
                 secondary_id = raw_label_data[id_].get("Secondary_id")
 
-                with open(data_dir / f"training_data/CT json" / f"{primary_id}.json", "r") as f:
+                with open(data_dir / f"CT json" / f"{primary_id}.json", "r") as f:
                     raw_ct_data = json.load(f)
 
                 text_primary = _get_text(raw_ct_data, section=raw_label_data[id_]["Section_id"])
 
                 if secondary_id:
-                    with open(data_dir / f"training_data/CT json" / f"{secondary_id}.json", "r") as f:
+                    with open(data_dir / f"CT json" / f"{secondary_id}.json", "r") as f:
                         raw_ct_data = json.load(f)
                     text_secondary = _get_text(raw_ct_data, section=raw_label_data[id_]["Section_id"])
                 else: