From b9a52a66a83a920278e6039ecc48d2ee07443090 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 12 Jul 2024 17:27:03 -0700 Subject: [PATCH 1/2] Make sure 'None' gets read in as a string --- scripts/table_updates/update_data_table.py | 11 +++++++---- scripts/table_updates/utilities.py | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/table_updates/update_data_table.py b/scripts/table_updates/update_data_table.py index 343ed59a..53bada95 100644 --- a/scripts/table_updates/update_data_table.py +++ b/scripts/table_updates/update_data_table.py @@ -289,15 +289,18 @@ def main(): with open(project_config) as config_file: cohort_info = json.load(config_file) logger.info("Read cohort information successful.") - config_file.close() # get master table + # These are the internal, non-redacted tables table_id, condition = list(TABLE_INFO[table_type]) master_table = download_synapse_table(syn, table_id, condition) + # This contains the external, redacted tables TABLE_INFO["redacted"] = ('syn21446696',"table_type='data' and double_curated is false") - - # download data files - # TODO: find the cohort that has new data + + # download data files + # TODO: find the cohort that has new data + # This is a mapping to all the intake data. 
e.g. ProstateBPCIntake_data + # found here: https://www.synapse.org/Synapse:syn23286928 cohort_info_selected = cohort_info[table_type] cohort_data_list = [] for cohort in cohort_info_selected: diff --git a/scripts/table_updates/utilities.py b/scripts/table_updates/utilities.py index 5f90c745..dbe94d1a 100644 --- a/scripts/table_updates/utilities.py +++ b/scripts/table_updates/utilities.py @@ -69,7 +69,8 @@ def get_data(syn, label_data_id, cohort): Returns: Dataframe: label data """ - label_data = pandas.read_csv(syn.get(label_data_id).path,low_memory=False) + na_values = [" ", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"] + label_data = pandas.read_csv(syn.get(label_data_id).path, low_memory=False, na_values=na_values) label_data['cohort'] = cohort return(label_data) From 33f13d8ebe4d4eba192d0d1800be3c4ca07365aa Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 12 Jul 2024 17:42:32 -0700 Subject: [PATCH 2/2] Add keep_default_na=False to read_csv --- scripts/table_updates/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/table_updates/utilities.py b/scripts/table_updates/utilities.py index dbe94d1a..078e1b2b 100644 --- a/scripts/table_updates/utilities.py +++ b/scripts/table_updates/utilities.py @@ -70,7 +70,7 @@ def get_data(syn, label_data_id, cohort): Dataframe: label data """ na_values = [" ", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null"] - label_data = pandas.read_csv(syn.get(label_data_id).path, low_memory=False, na_values=na_values) + label_data = pandas.read_csv(syn.get(label_data_id).path, low_memory=False, na_values=na_values, keep_default_na=False) label_data['cohort'] = cohort return(label_data)