From b6fa640763320a6a43471d361f99b72b13238772 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Wed, 11 Sep 2024 20:02:20 +0000 Subject: [PATCH 01/11] use detailed columns to extract tier1a code --- .../export_bpc_selected_cases.R | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R index ff6ac28f..5bece297 100644 --- a/scripts/case_selection/export_bpc_selected_cases.R +++ b/scripts/case_selection/export_bpc_selected_cases.R @@ -42,12 +42,12 @@ if (is.null(opt$input) || is.null(opt$phase) || is.null(opt$cohort) || is.null(o stop("Usage: Rscript export_bpc_selected_cases.R -h") } -in_file <- opt$input -out_folder <- opt$output -phase <- opt$phase -cohort <- opt$cohort -site <- opt$site -release <- opt$release +in_file <- 'syn62828306' +out_folder <- 'syn62828556' +phase <- 'phase 1' +cohort <- 'NSCLC' +site <- 'DFCI' +release <- '17.2-consortium' # check user input ----------------- @@ -84,7 +84,7 @@ print("get clinical data") sex_mapping <- synTableQuery("SELECT * FROM syn7434222")$asDataFrame() race_mapping <- synTableQuery("SELECT * FROM syn7434236")$asDataFrame() ethnicity_mapping <- synTableQuery("SELECT * FROM syn7434242")$asDataFrame() -# sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame() +sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame() # output setup phase_no_space <- sub(" ","_",sub(" ","",phase)) @@ -128,18 +128,18 @@ patient_output$redcap_repeat_instance <- rep("") patient_output$genie_patient_id <- patient_output$record_id patient_output$birth_year <- clinical$birth_year[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_ethnicity_code <- clinical$ethnicity[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_race_code_primary <- clinical$primary_race[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_race_code_secondary <- clinical$secondary_race[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_race_code_tertiary <- clinical$tertiary_race[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_sex_code <- clinical$sex[match(patient_output$genie_patient_id, clinical$patient_id)] +patient_output$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_output$genie_patient_id, clinical$patient_id)] +patient_output$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)] +patient_output$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)] +patient_output$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)] +patient_output$naaccr_sex_code <- clinical$sex_detailed[match(patient_output$genie_patient_id, clinical$patient_id)] # mapping to code -patient_output$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_output$naaccr_ethnicity_code, ethnicity_mapping$CBIO_LABEL)] -patient_output$naaccr_race_code_primary <- race_mapping$CODE[match(patient_output$naaccr_race_code_primary, race_mapping$CBIO_LABEL)] -patient_output$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_output$naaccr_race_code_secondary, race_mapping$CBIO_LABEL)] -patient_output$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_output$naaccr_race_code_tertiary, race_mapping$CBIO_LABEL)] -patient_output$naaccr_sex_code <- sex_mapping$CODE[match(patient_output$naaccr_sex_code,sex_mapping$CBIO_LABEL)] +patient_output$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_output$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)] +patient_output$naaccr_race_code_primary <- race_mapping$CODE[match(patient_output$naaccr_race_code_primary, race_mapping$DESCRIPTION)] +patient_output$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_output$naaccr_race_code_secondary, race_mapping$DESCRIPTION)] +patient_output$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_output$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)] +patient_output$naaccr_sex_code <- sex_mapping$CODE[match(patient_output$naaccr_sex_code,sex_mapping$DESCRIPTION)] print("recode") # recode # cannotReleaseHIPAA = NA @@ -165,6 +165,7 @@ sample_info_list <- lapply(samples_per_patient,function(x){ temp_df$cpt_genie_sample_id = x[i] temp_df$cpt_oncotree_code = clinical$oncotree_code[clinical$sample_id == x[i]] temp_df$cpt_sample_type = clinical$sample_type_detailed[clinical$sample_id == x[i]] + temp_df$cpt_sample_type = sample_type_mapping$CODE[match(temp_df$cpt_sample_type, sample_type_mapping$DESCRIPTION)] temp_df$cpt_seq_assay_id = clinical$seq_assay_id[clinical$sample_id == x[i]] temp_df$cpt_seq_date = clinical$seq_year[clinical$sample_id == x[i]] temp_df$age_at_seq_report = clinical$age_at_seq_report_days[clinical$sample_id == x[i]] From 4c6e4af628c317e84f21eb20b3df872da3dc2659 Mon Sep 17 00:00:00 2001 From: Dan Lu <90745557+danlu1@users.noreply.github.com> Date: Wed, 11 Sep 2024 14:03:40 -0700 Subject: [PATCH 02/11] Update export_bpc_selected_cases.R revert changes in paramters --- scripts/case_selection/export_bpc_selected_cases.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R index 5bece297..2972fc9a 100644 --- a/scripts/case_selection/export_bpc_selected_cases.R +++ b/scripts/case_selection/export_bpc_selected_cases.R @@ -42,12 +42,12 @@ if (is.null(opt$input) || is.null(opt$phase) || is.null(opt$cohort) || is.null(o stop("Usage: Rscript export_bpc_selected_cases.R -h") } -in_file <- 'syn62828306' -out_folder <- 'syn62828556' -phase <- 'phase 1' -cohort <- 'NSCLC' -site <- 'DFCI' -release <- '17.2-consortium' +in_file <- opt$input +out_folder <- opt$output +phase <- opt$phase +cohort <- opt$cohort +site <- opt$site +release <- opt$release # check user input ----------------- From a63b62fbecd29164d27941b4bb26bc8589de2b66 Mon Sep 17 00:00:00 2001 From: Dan Lu <90745557+danlu1@users.noreply.github.com> Date: Wed, 11 Sep 2024 16:01:55 -0700 Subject: [PATCH 03/11] Update export_bpc_selected_cases.R Update comment in recode section to add description for new values --- scripts/case_selection/export_bpc_selected_cases.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R index 2972fc9a..9db3eabf 100644 --- a/scripts/case_selection/export_bpc_selected_cases.R +++ b/scripts/case_selection/export_bpc_selected_cases.R @@ -144,11 +144,11 @@ print("recode") # recode # cannotReleaseHIPAA = NA patient_output$birth_year[which(patient_output$birth_year == "cannotReleaseHIPAA")] <- NA -# -1 Not collected = 9 Unknown +# -1 Not collected = 9 Unknown whether Spanish or not patient_output$naaccr_ethnicity_code[which(patient_output$naaccr_ethnicity_code == -1)] <- 9 -# -1 Not collected = 99 Unknown +# -1 Not collected = 99 Unknown by patient patient_output$naaccr_race_code_primary[which(patient_output$naaccr_race_code_primary == -1)] <- 99 -# -1 Not collected = 88 according to NAACCR +# -1 Not collected = 88 No further race documented according to NAACCR patient_output$naaccr_race_code_secondary[which(patient_output$naaccr_race_code_secondary == -1)] <- 88 patient_output$naaccr_race_code_tertiary[which(patient_output$naaccr_race_code_tertiary == -1)] <- 88 From 7677a3b76ce715044d579f3c6cf2fa8040f4811d Mon Sep 17 00:00:00 2001 From: Dan Lu <90745557+danlu1@users.noreply.github.com> Date: Wed, 11 Sep 2024 16:09:50 -0700 Subject: [PATCH 04/11] Update export_bpc_selected_cases.R keep sample_type as descriptive values --- scripts/case_selection/export_bpc_selected_cases.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R index 9db3eabf..15b2423f 100644 --- a/scripts/case_selection/export_bpc_selected_cases.R +++ b/scripts/case_selection/export_bpc_selected_cases.R @@ -84,7 +84,7 @@ print("get clinical data") sex_mapping <- synTableQuery("SELECT * FROM syn7434222")$asDataFrame() race_mapping <- synTableQuery("SELECT * FROM syn7434236")$asDataFrame() ethnicity_mapping <- synTableQuery("SELECT * FROM syn7434242")$asDataFrame() -sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame() +#sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame() # output setup phase_no_space <- sub(" ","_",sub(" ","",phase)) @@ -165,7 +165,6 @@ sample_info_list <- lapply(samples_per_patient,function(x){ temp_df$cpt_genie_sample_id = x[i] temp_df$cpt_oncotree_code = clinical$oncotree_code[clinical$sample_id == x[i]] temp_df$cpt_sample_type = clinical$sample_type_detailed[clinical$sample_id == x[i]] - temp_df$cpt_sample_type = sample_type_mapping$CODE[match(temp_df$cpt_sample_type, sample_type_mapping$DESCRIPTION)] temp_df$cpt_seq_assay_id = clinical$seq_assay_id[clinical$sample_id == x[i]] temp_df$cpt_seq_date = clinical$seq_year[clinical$sample_id == x[i]] temp_df$age_at_seq_report = clinical$age_at_seq_report_days[clinical$sample_id == x[i]] From 9821503f6dcd032c5acb9bb17d654712c15d371c Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 12 Sep 2024 22:07:09 +0000 Subject: [PATCH 05/11] group mapping logic into one function --- .../export_bpc_selected_cases.R | 61 +++++++++++-------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R index 15b2423f..b3187c69 100644 --- a/scripts/case_selection/export_bpc_selected_cases.R +++ b/scripts/case_selection/export_bpc_selected_cases.R @@ -42,12 +42,12 @@ if (is.null(opt$input) || is.null(opt$phase) || is.null(opt$cohort) || is.null(o stop("Usage: Rscript export_bpc_selected_cases.R -h") } -in_file <- opt$input -out_folder <- opt$output -phase <- opt$phase -cohort <- opt$cohort -site <- opt$site -release <- opt$release +in_file <- 'syn62828306' +out_folder <- 'syn62828556' +phase <- 'phase 1' +cohort <- 'NSCLC' +site <- 'DFCI' +release <- '17.2-consortium' # check user input ----------------- @@ -120,26 +120,35 @@ existing_patients = selected_cases[selected_cases %in% clinical$patient_id] samples_per_patient <- clinical$sample_id[clinical$patient_id %in% selected_cases] print("map data for each instrument") -# mapping data for each instrument -# instrument - patient_characteristics -patient_output <- data.frame("record_id" = existing_patients) -patient_output$redcap_repeat_instrument <- rep("") -patient_output$redcap_repeat_instance <- rep("") - -patient_output$genie_patient_id <- patient_output$record_id -patient_output$birth_year <- clinical$birth_year[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_sex_code <- clinical$sex_detailed[match(patient_output$genie_patient_id, clinical$patient_id)] - -# mapping to code -patient_output$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_output$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)] -patient_output$naaccr_race_code_primary <- race_mapping$CODE[match(patient_output$naaccr_race_code_primary, race_mapping$DESCRIPTION)] -patient_output$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_output$naaccr_race_code_secondary, race_mapping$DESCRIPTION)] -patient_output$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_output$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)] -patient_output$naaccr_sex_code <- sex_mapping$CODE[match(patient_output$naaccr_sex_code,sex_mapping$DESCRIPTION)] +#' mapping data for instrument - patient_characteristics +#' +#' @param clinical A data frame of released clinical data for selected cases +#' @param existing_patients A data frame of available patient after case selection +#' @return A data frame with mapped code +remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){ + + patient_df <- data.frame("record_id" = existing_patients) + patient_df$redcap_repeat_instrument <- rep("") + patient_df$redcap_repeat_instance <- rep("") + + patient_df$genie_patient_id <- patient_df$record_id + patient_df$birth_year <- clinical$birth_year[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_sex_code <- clinical$sex_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + + # mapping to code + patient_df$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_df$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)] + patient_df$naaccr_race_code_primary <- race_mapping$CODE[match(patient_df$naaccr_race_code_primary, race_mapping$DESCRIPTION)] + patient_df$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_df$naaccr_race_code_secondary, race_mapping$DESCRIPTION)] + patient_df$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_df$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)] + patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)] +} + +patient_output <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping) + print("recode") # recode # cannotReleaseHIPAA = NA From 6417c2236d6db66294f9a4bd8148a71f0826d07e Mon Sep 17 00:00:00 2001 From: Dan Lu <90745557+danlu1@users.noreply.github.com> Date: Thu, 12 Sep 2024 15:30:25 -0700 Subject: [PATCH 06/11] Update export_bpc_selected_cases.R --- scripts/case_selection/export_bpc_selected_cases.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R index b3187c69..664c6e65 100644 --- a/scripts/case_selection/export_bpc_selected_cases.R +++ b/scripts/case_selection/export_bpc_selected_cases.R @@ -42,12 +42,12 @@ if (is.null(opt$input) || is.null(opt$phase) || is.null(opt$cohort) || is.null(o stop("Usage: Rscript export_bpc_selected_cases.R -h") } -in_file <- 'syn62828306' -out_folder <- 'syn62828556' -phase <- 'phase 1' -cohort <- 'NSCLC' -site <- 'DFCI' -release <- '17.2-consortium' +in_file <- opt$input +out_folder <- opt$output +phase <- opt$phase +cohort <- opt$cohort +site <- opt$site +release <- opt$release # check user input ----------------- From 73a852ab2f49efdc3e383b815024a699803eb95e Mon Sep 17 00:00:00 2001 From: danlu1 Date: Fri, 13 Sep 2024 00:19:17 +0000 Subject: [PATCH 07/11] add patient characteristics remap function to shared_fxns and add test to it --- scripts/case_selection/shared_fxns.R | 29 +++++++++++ .../case_selection/tests/test_shared_fxns.R | 51 +++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/scripts/case_selection/shared_fxns.R b/scripts/case_selection/shared_fxns.R index 2c1a581b..0d3ca642 100644 --- a/scripts/case_selection/shared_fxns.R +++ b/scripts/case_selection/shared_fxns.R @@ -255,4 +255,33 @@ get_main_genie_clinical_id <- function(release){ } } return(NULL) +} + +#' Mapping data for patient_characteristics +#' +#' @param clinical A data frame of released clinical data for selected cases +#' @param existing_patients A data frame of available patient after case selection +#' @return A data frame with mapped code +remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){ + + patient_df <- data.frame("record_id" = existing_patients) + patient_df$redcap_repeat_instrument <- rep("") + patient_df$redcap_repeat_instance <- rep("") + + patient_df$genie_patient_id <- patient_df$record_id + patient_df$birth_year <- clinical$birth_year[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_sex_code <- clinical$sex_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + + # mapping to code + patient_df$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_df$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)] + patient_df$naaccr_race_code_primary <- race_mapping$CODE[match(patient_df$naaccr_race_code_primary, race_mapping$DESCRIPTION)] + patient_df$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_df$naaccr_race_code_secondary, race_mapping$DESCRIPTION)] + patient_df$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_df$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)] + patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)] + + return(patient_df) } \ No newline at end of file diff --git a/scripts/case_selection/tests/test_shared_fxns.R b/scripts/case_selection/tests/test_shared_fxns.R index 60aa7604..0db589ce 100644 --- a/scripts/case_selection/tests/test_shared_fxns.R +++ b/scripts/case_selection/tests/test_shared_fxns.R @@ -52,3 +52,54 @@ test_that("get_main_genie_clinical_id returns NULL when data_clinical.txt does n result <- get_main_genie_clinical_id(release) expect_null(result) }) + +test_that("remap_patient_characteristics works as expected", { + + # Mock input data + clinical <- data.frame( + patient_id = c(1, 2, 3), + birth_year = c(1980, 1990, 2000), + ethnicity_detailed = c("Hispanic", "Non-Hispanic", "Hispanic"), + primary_race_detailed = c("White", "Black", "Asian"), + secondary_race_detailed = c("Unknown", "White", "Black"), + tertiary_race_detailed = c("Asian", "Unknown", "White"), + sex_detailed = c("Male", "Female", "Male") + ) + + existing_patients <- c(1, 2, 3) + + ethnicity_mapping <- data.frame( + DESCRIPTION = c("Hispanic", "Non-Hispanic"), + CODE = c("1", "2") + ) + + race_mapping <- data.frame( + DESCRIPTION = c("White", "Black", "Asian", "Unknown"), + CODE = c("1", "2", "3", "99") + ) + + sex_mapping <- data.frame( + DESCRIPTION = c("Male", "Female"), + CODE = c("M", "F") + ) + + # Expected output + expected_output <- data.frame( + record_id = c(1, 2, 3), + redcap_repeat_instrument = c("", "", ""), + redcap_repeat_instance = c("", "", ""), + genie_patient_id = c(1, 2, 3), + birth_year = c(1980, 1990, 2000), + naaccr_ethnicity_code = c("1", "2", "1"), + naaccr_race_code_primary = c("1", "2", "3"), + naaccr_race_code_secondary = c("99", "1", "2"), + naaccr_race_code_tertiary = c("3", "99", "1"), + naaccr_sex_code = c("M", "F", "M") + ) + + # Run the function + result <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping) + + # Test if the output is as expected + expect_equal(result, expected_output) +}) From d5ab24fa4f7d0e217eaaa63dffc563e2a1805fb4 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Fri, 13 Sep 2024 03:35:10 +0000 Subject: [PATCH 08/11] relocate remap_patient_characteristics --- .../export_bpc_selected_cases.R | 28 +------------------ 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R index 664c6e65..23b530d2 100644 --- a/scripts/case_selection/export_bpc_selected_cases.R +++ b/scripts/case_selection/export_bpc_selected_cases.R @@ -120,33 +120,7 @@ existing_patients = selected_cases[selected_cases %in% clinical$patient_id] samples_per_patient <- clinical$sample_id[clinical$patient_id %in% selected_cases] print("map data for each instrument") -#' mapping data for instrument - patient_characteristics -#' -#' @param clinical A data frame of released clinical data for selected cases -#' @param existing_patients A data frame of available patient after case selection -#' @return A data frame with mapped code -remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){ - - patient_df <- data.frame("record_id" = existing_patients) - patient_df$redcap_repeat_instrument <- rep("") - patient_df$redcap_repeat_instance <- rep("") - - patient_df$genie_patient_id <- patient_df$record_id - patient_df$birth_year <- clinical$birth_year[match(patient_df$genie_patient_id, clinical$patient_id)] - patient_df$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] - patient_df$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] - patient_df$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] - patient_df$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] - patient_df$naaccr_sex_code <- clinical$sex_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] - - # mapping to code - patient_df$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_df$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)] - patient_df$naaccr_race_code_primary <- race_mapping$CODE[match(patient_df$naaccr_race_code_primary, race_mapping$DESCRIPTION)] - patient_df$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_df$naaccr_race_code_secondary, race_mapping$DESCRIPTION)] - patient_df$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_df$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)] - patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)] -} - +# instrument - patient_characteristics patient_output <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping) print("recode") From 4919998bcda7d6a889de0394fad0f4b9743a02c4 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Fri, 13 Sep 2024 20:03:17 +0000 Subject: [PATCH 09/11] throw warning instead in check_for_missing_values --- .../export_bpc_selected_cases.R | 5 +++ scripts/case_selection/shared_fxns.R | 22 +++++++++++- .../case_selection/tests/test_shared_fxns.R | 35 +++++++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R index 23b530d2..faab50dd 100644 --- a/scripts/case_selection/export_bpc_selected_cases.R +++ b/scripts/case_selection/export_bpc_selected_cases.R @@ -122,6 +122,11 @@ samples_per_patient <- clinical$sample_id[clinical$patient_id %in% selected_case print("map data for each instrument") # instrument - patient_characteristics patient_output <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping) +# check missing values +# get naaccr code columns +naaccr_col <- grep("naaccr", colnames(patient_output), value = TRUE) +# error out if NAs or empty strings are detected in naaccr code columns +check_for_missing_values(patient_output, naaccr_col) print("recode") # recode diff --git a/scripts/case_selection/shared_fxns.R b/scripts/case_selection/shared_fxns.R index 0d3ca642..e1e49f6c 100644 --- a/scripts/case_selection/shared_fxns.R +++ b/scripts/case_selection/shared_fxns.R @@ -261,6 +261,9 @@ get_main_genie_clinical_id <- function(release){ #' #' @param clinical A data frame of released clinical data for selected cases #' @param existing_patients A data frame of available patient after case selection +#' @param ethnicity_mapping The NAACCR_ETHNICITY_MAPPING data frame +#' @param race_mapping The NAACCR_RACE_MAPPING data frame +#' @param sex_mapping The NAACCR_SEX_MAPPING data frame #' @return A data frame with mapped code remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){ @@ -284,4 +287,21 @@ remap_patient_characteristics <- function(clinical, existing_patients, ethnicity patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)] return(patient_df) -} \ No newline at end of file +} + +#' Check for missing values +#' +#' @param data The data frame to check against +#' @param columns The target columns +check_for_missing_values <- function(data, columns) { + # Check for NA values or empty strings + missingness_col <- c() + for (col in columns) { + if (any(is.na(data[[col]]) | data[[col]] == "" )){ + missingness_col <- c(col, missingness_col) + } + } + if (length(missingness_col) > 0) { + warning(paste0("Warning: Missing or empty values found in column(s): ", paste(missingness_col,collapse=", "))) + } +} diff --git a/scripts/case_selection/tests/test_shared_fxns.R b/scripts/case_selection/tests/test_shared_fxns.R index 0db589ce..dbde090f 100644 --- a/scripts/case_selection/tests/test_shared_fxns.R +++ b/scripts/case_selection/tests/test_shared_fxns.R @@ -103,3 +103,38 @@ test_that("remap_patient_characteristics works as expected", { # Test if the output is as expected expect_equal(result, expected_output) }) + +test_that("check_for_missing_values - no missing or empty values in the data", { + data <- data.frame( + col1 = c(1, 2, 3), + col2 = c("a", "b", "c") + ) + expect_warning(check_for_missing_values(data, c("col1", "col2")), NA) +}) + +test_that("check_for_missing_values - NAs are detected", { + data <- data.frame( + col1 = c(1, NA, 3), + col2 = c("a", "b", "c") + ) + expect_warning(check_for_missing_values(data, c("col1", "col2")), + "Warning: Missing or empty values found in column\\(s\\): col1") +}) + +test_that("check_for_missing_values - empty string values are detected", { + data <- data.frame( + col1 = c(1, 2, 3), + col2 = c("a", "", "c") + ) + expect_warning(check_for_missing_values(data, c("col1", "col2")), + "Warning: Missing or empty values found in column\\(s\\): col2") +}) + +test_that("check_for_missing_values - multiple missing and empty values are detected", { + data <- data.frame( + col1 = c(1, NA, 3), + col2 = c("a", "", "c") + ) + expect_warning(check_for_missing_values(data, c("col1", "col2")), + "Warning: Missing or empty values found in column\\(s\\): col2, col1") +}) \ No newline at end of file From 65f371dda6c198e32a674bcfc5729c5cce628e99 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Fri, 13 Sep 2024 22:14:12 +0000 Subject: [PATCH 10/11] filter out CHOP, PROV, JHU rows when checking missingness --- scripts/case_selection/shared_fxns.R | 4 ++- .../case_selection/tests/test_shared_fxns.R | 34 +++++++++++++------ 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/scripts/case_selection/shared_fxns.R b/scripts/case_selection/shared_fxns.R index e1e49f6c..08688c9c 100644 --- a/scripts/case_selection/shared_fxns.R +++ b/scripts/case_selection/shared_fxns.R @@ -289,11 +289,13 @@ remap_patient_characteristics <- function(clinical, existing_patients, ethnicity return(patient_df) } -#' Check for missing values +#' Check for missing values in naaccr columns #' #' @param data The data frame to check against #' @param columns The target columns check_for_missing_values <- function(data, columns) { + # filter out CHOP, PROV, JHU centers with known NAs + data <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ] # Check for NA values or empty strings missingness_col <- c() for (col in columns) { diff --git a/scripts/case_selection/tests/test_shared_fxns.R b/scripts/case_selection/tests/test_shared_fxns.R index dbde090f..93e5ecf7 100644 --- a/scripts/case_selection/tests/test_shared_fxns.R +++ b/scripts/case_selection/tests/test_shared_fxns.R @@ -104,37 +104,51 @@ test_that("remap_patient_characteristics works as expected", { expect_equal(result, expected_output) }) -test_that("check_for_missing_values - no missing or empty values in the data", { +test_that("check_for_missing_values - no missing or empty values in centers other than CHOP, PROV, JHU", { data <- data.frame( - col1 = c(1, 2, 3), - col2 = c("a", "b", "c") + col1 = c(1, 2, 3, NA), + col2 = c("a", "b", "c", ""), + genie_patient_id = c('a', 'b', 'c', 'CHOP123') ) expect_warning(check_for_missing_values(data, c("col1", "col2")), NA) + }) -test_that("check_for_missing_values - NAs are detected", { +test_that("check_for_missing_values - NAs are detected in centers other than CHOP, PROV, JHU", { data <- data.frame( col1 = c(1, NA, 3), - col2 = c("a", "b", "c") + col2 = c("a", "b", "c"), + genie_patient_id = c('CHOP123', 'b', 'PROV234') ) expect_warning(check_for_missing_values(data, c("col1", "col2")), "Warning: Missing or empty values found in column\\(s\\): col1") }) -test_that("check_for_missing_values - empty string values are detected", { +test_that("check_for_missing_values - empty string values are detected in centers other than CHOP, PROV, JHU", { data <- data.frame( col1 = c(1, 2, 3), - col2 = c("a", "", "c") + col2 = c("a", "", "c"), + genie_patient_id = c('CHOP123', 'b', 'PROV234') ) expect_warning(check_for_missing_values(data, c("col1", "col2")), "Warning: Missing or empty values found in column\\(s\\): col2") }) -test_that("check_for_missing_values - multiple missing and empty values are detected", { +test_that("check_for_missing_values - multiple missing and empty values are detected in centers other than CHOP, PROV, JHU", { data <- data.frame( - col1 = c(1, NA, 3), - col2 = c("a", "", "c") + col1 = c(1, NA, ""), + col2 = c("a", "", "c"), + genie_patient_id = c('CHOP123', 'b', 'PROV234') ) expect_warning(check_for_missing_values(data, c("col1", "col2")), "Warning: Missing or empty values found in column\\(s\\): col2, col1") +}) + +test_that("check_for_missing_values - multiple missing and empty values are detected in CHOP, PROV, JHU centers", { + data <- data.frame( + col1 = c(1, NA, 2), + col2 = c("a", "", "c"), + genie_patient_id = c('a', 'CHOP123', 'PROV234') + ) + expect_warning(check_for_missing_values(data, c("col1", "col2")), NA) }) \ No newline at end of file From 577b2995244c3e4b80d75032d269aaafb03b3f8d Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 16 Sep 2024 17:30:29 +0000 Subject: [PATCH 11/11] CHOP|PROV|JHU only if checking for missiness in NAACCR code columns --- scripts/case_selection/shared_fxns.R | 12 +++-- .../case_selection/tests/test_shared_fxns.R | 51 +++++++------------ 2 files changed, 27 insertions(+), 36 deletions(-) diff --git a/scripts/case_selection/shared_fxns.R b/scripts/case_selection/shared_fxns.R index 08688c9c..c70e9d5f 100644 --- a/scripts/case_selection/shared_fxns.R +++ b/scripts/case_selection/shared_fxns.R @@ -294,15 +294,19 @@ remap_patient_characteristics <- function(clinical, existing_patients, ethnicity #' @param data The data frame to check against #' @param columns The target columns check_for_missing_values <- function(data, columns) { - # filter out CHOP, PROV, JHU centers with known NAs - data <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ] # Check for NA values or empty strings missingness_col <- c() for (col in columns) { - if (any(is.na(data[[col]]) | data[[col]] == "" )){ + if (col %in% c("naaccr_race_code_tertiary", "naaccr_race_code_secondary")) { + # filter out CHOP, PROV, JHU centers with known NAs in NAACCR code columns + relevant_rows <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ] + } else{ + relevant_rows <- data + } + if (any(is.na(relevant_rows[[col]]) | relevant_rows[[col]] == "" )){ missingness_col <- c(col, missingness_col) } - } + } if (length(missingness_col) > 0) { warning(paste0("Warning: Missing or empty values found in column(s): ", paste(missingness_col,collapse=", "))) } diff --git a/scripts/case_selection/tests/test_shared_fxns.R b/scripts/case_selection/tests/test_shared_fxns.R index 93e5ecf7..fdd7fdb3 100644 --- a/scripts/case_selection/tests/test_shared_fxns.R +++ b/scripts/case_selection/tests/test_shared_fxns.R @@ -104,51 +104,38 @@ test_that("remap_patient_characteristics works as expected", { expect_equal(result, expected_output) }) -test_that("check_for_missing_values - no missing or empty values in centers other than CHOP, PROV, JHU", { +test_that("check_for_missing_values - no missing or empty values", { data <- data.frame( - col1 = c(1, 2, 3, NA), - col2 = c("a", "b", "c", ""), - genie_patient_id = c('a', 'b', 'c', 'CHOP123') + col1 = c(1, 2, 3), + col2 = c("a", "b", "c"), + genie_patient_id = c("a", "b", "CHOP123"), + naaccr_race_code_tertiary = c("a", "b", "c"), + naaccr_race_code_secondary = c("a", "b", "c") ) - expect_warning(check_for_missing_values(data, c("col1", "col2")), NA) + expect_no_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary"))) }) -test_that("check_for_missing_values - NAs are detected in centers other than CHOP, PROV, JHU", { +test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in centers other than CHOP, PROV, JHU", { data <- data.frame( - col1 = c(1, NA, 3), + col1 = c(1, NA, ""), col2 = c("a", "b", "c"), - genie_patient_id = c('CHOP123', 'b', 'PROV234') + genie_patient_id = c("CHOP123", "b", "PROV234"), + naaccr_race_code_tertiary = c("a", "", "c"), + naaccr_race_code_secondary = c("a", "b", "c") ) - expect_warning(check_for_missing_values(data, c("col1", "col2")), - "Warning: Missing or empty values found in column\\(s\\): col1") + expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")), + "Warning: Missing or empty values found in column\\(s\\): naaccr_race_code_tertiary, col1") }) -test_that("check_for_missing_values - empty string values are detected in centers other than CHOP, PROV, JHU", { - data <- data.frame( - col1 = c(1, 2, 3), - col2 = c("a", "", "c"), - genie_patient_id = c('CHOP123', 'b', 'PROV234') - ) - expect_warning(check_for_missing_values(data, c("col1", "col2")), - "Warning: Missing or empty values found in column\\(s\\): col2") -}) - -test_that("check_for_missing_values - multiple missing and empty values are detected in centers other than CHOP, PROV, JHU", { +test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in CHOP, PROV, JHU centers", { data <- data.frame( col1 = c(1, NA, ""), col2 = c("a", "", "c"), - genie_patient_id = c('CHOP123', 'b', 'PROV234') + genie_patient_id = c("CHOP123", "b", "PROV234"), + naaccr_race_code_tertiary = c("", "b", "c"), + naaccr_race_code_secondary = c("a", "b", NA) ) - expect_warning(check_for_missing_values(data, c("col1", "col2")), + expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")), "Warning: Missing or empty values found in column\\(s\\): col2, col1") }) - -test_that("check_for_missing_values - multiple missing and empty values are detected in CHOP, PROV, JHU centers", { - data <- data.frame( - col1 = c(1, NA, 2), - col2 = c("a", "", "c"), - genie_patient_id = c('a', 'CHOP123', 'PROV234') - ) - expect_warning(check_for_missing_values(data, c("col1", "col2")), NA) -}) \ No newline at end of file