diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R index ff6ac28f..faab50dd 100644 --- a/scripts/case_selection/export_bpc_selected_cases.R +++ b/scripts/case_selection/export_bpc_selected_cases.R @@ -84,7 +84,7 @@ print("get clinical data") sex_mapping <- synTableQuery("SELECT * FROM syn7434222")$asDataFrame() race_mapping <- synTableQuery("SELECT * FROM syn7434236")$asDataFrame() ethnicity_mapping <- synTableQuery("SELECT * FROM syn7434242")$asDataFrame() -# sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame() +#sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame() # output setup phase_no_space <- sub(" ","_",sub(" ","",phase)) @@ -120,35 +120,23 @@ existing_patients = selected_cases[selected_cases %in% clinical$patient_id] samples_per_patient <- clinical$sample_id[clinical$patient_id %in% selected_cases] print("map data for each instrument") -# mapping data for each instrument # instrument - patient_characteristics -patient_output <- data.frame("record_id" = existing_patients) -patient_output$redcap_repeat_instrument <- rep("") -patient_output$redcap_repeat_instance <- rep("") - -patient_output$genie_patient_id <- patient_output$record_id -patient_output$birth_year <- clinical$birth_year[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_ethnicity_code <- clinical$ethnicity[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_race_code_primary <- clinical$primary_race[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_race_code_secondary <- clinical$secondary_race[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_race_code_tertiary <- clinical$tertiary_race[match(patient_output$genie_patient_id, clinical$patient_id)] -patient_output$naaccr_sex_code <- clinical$sex[match(patient_output$genie_patient_id, clinical$patient_id)] - -# mapping to code -patient_output$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_output$naaccr_ethnicity_code, ethnicity_mapping$CBIO_LABEL)] -patient_output$naaccr_race_code_primary <- race_mapping$CODE[match(patient_output$naaccr_race_code_primary, race_mapping$CBIO_LABEL)] -patient_output$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_output$naaccr_race_code_secondary, race_mapping$CBIO_LABEL)] -patient_output$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_output$naaccr_race_code_tertiary, race_mapping$CBIO_LABEL)] -patient_output$naaccr_sex_code <- sex_mapping$CODE[match(patient_output$naaccr_sex_code,sex_mapping$CBIO_LABEL)] +patient_output <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping) +# check missing values +# get naaccr code columns +naaccr_col <- grep("naaccr", colnames(patient_output), value = TRUE) +# error out if NAs or empty strings are detected in naaccr code columns +check_for_missing_values(patient_output, naaccr_col) + print("recode") # recode # cannotReleaseHIPAA = NA patient_output$birth_year[which(patient_output$birth_year == "cannotReleaseHIPAA")] <- NA -# -1 Not collected = 9 Unknown +# -1 Not collected = 9 Unknown whether Spanish or not patient_output$naaccr_ethnicity_code[which(patient_output$naaccr_ethnicity_code == -1)] <- 9 -# -1 Not collected = 99 Unknown +# -1 Not collected = 99 Unknown by patient patient_output$naaccr_race_code_primary[which(patient_output$naaccr_race_code_primary == -1)] <- 99 -# -1 Not collected = 88 according to NAACCR +# -1 Not collected = 88 No further race documented according to NAACCR patient_output$naaccr_race_code_secondary[which(patient_output$naaccr_race_code_secondary == -1)] <- 88 patient_output$naaccr_race_code_tertiary[which(patient_output$naaccr_race_code_tertiary == -1)] <- 88 diff --git a/scripts/case_selection/shared_fxns.R b/scripts/case_selection/shared_fxns.R index 2c1a581b..c70e9d5f 100644 --- a/scripts/case_selection/shared_fxns.R +++ b/scripts/case_selection/shared_fxns.R @@ -255,4 +255,59 @@ get_main_genie_clinical_id <- function(release){ } } return(NULL) -} \ No newline at end of file +} + +#' Mapping data for patient_characteristics +#' +#' @param clinical A data frame of released clinical data for selected cases +#' @param existing_patients A data frame of available patient after case selection +#' @param ethnicity_mapping The NAACCR_ETHNICITY_MAPPING data frame +#' @param race_mapping The NAACCR_RACE_MAPPING data frame +#' @param sex_mapping The NAACCR_SEX_MAPPING data frame +#' @return A data frame with mapped code +remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){ + + patient_df <- data.frame("record_id" = existing_patients) + patient_df$redcap_repeat_instrument <- rep("") + patient_df$redcap_repeat_instance <- rep("") + + patient_df$genie_patient_id <- patient_df$record_id + patient_df$birth_year <- clinical$birth_year[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + patient_df$naaccr_sex_code <- clinical$sex_detailed[match(patient_df$genie_patient_id, clinical$patient_id)] + + # mapping to code + patient_df$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_df$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)] + patient_df$naaccr_race_code_primary <- race_mapping$CODE[match(patient_df$naaccr_race_code_primary, race_mapping$DESCRIPTION)] + patient_df$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_df$naaccr_race_code_secondary, race_mapping$DESCRIPTION)] + patient_df$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_df$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)] + patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)] + + return(patient_df) +} + +#' Check for missing values in naaccr columns +#' +#' @param data The data frame to check against +#' @param columns The target columns +check_for_missing_values <- function(data, columns) { + # Check for NA values or empty strings + missingness_col <- c() + for (col in columns) { + if (col %in% c("naaccr_race_code_tertiary", "naaccr_race_code_secondary")) { + # filter out CHOP, PROV, JHU centers with known NAs in NAACCR code columns + relevant_rows <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ] + } else{ + relevant_rows <- data + } + if (any(is.na(relevant_rows[[col]]) | relevant_rows[[col]] == "" )){ + missingness_col <- c(col, missingness_col) + } + } + if (length(missingness_col) > 0) { + warning(paste0("Warning: Missing or empty values found in column(s): ", paste(missingness_col,collapse=", "))) + } +} diff --git a/scripts/case_selection/tests/test_shared_fxns.R b/scripts/case_selection/tests/test_shared_fxns.R index 60aa7604..fdd7fdb3 100644 --- a/scripts/case_selection/tests/test_shared_fxns.R +++ b/scripts/case_selection/tests/test_shared_fxns.R @@ -52,3 +52,90 @@ test_that("get_main_genie_clinical_id returns NULL when data_clinical.txt does n result <- get_main_genie_clinical_id(release) expect_null(result) }) + +test_that("remap_patient_characteristics works as expected", { + + # Mock input data + clinical <- data.frame( + patient_id = c(1, 2, 3), + birth_year = c(1980, 1990, 2000), + ethnicity_detailed = c("Hispanic", "Non-Hispanic", "Hispanic"), + primary_race_detailed = c("White", "Black", "Asian"), + secondary_race_detailed = c("Unknown", "White", "Black"), + tertiary_race_detailed = c("Asian", "Unknown", "White"), + sex_detailed = c("Male", "Female", "Male") + ) + + existing_patients <- c(1, 2, 3) + + ethnicity_mapping <- data.frame( + DESCRIPTION = c("Hispanic", "Non-Hispanic"), + CODE = c("1", "2") + ) + + race_mapping <- data.frame( + DESCRIPTION = c("White", "Black", "Asian", "Unknown"), + CODE = c("1", "2", "3", "99") + ) + + sex_mapping <- data.frame( + DESCRIPTION = c("Male", "Female"), + CODE = c("M", "F") + ) + + # Expected output + expected_output <- data.frame( + record_id = c(1, 2, 3), + redcap_repeat_instrument = c("", "", ""), + redcap_repeat_instance = c("", "", ""), + genie_patient_id = c(1, 2, 3), + birth_year = c(1980, 1990, 2000), + naaccr_ethnicity_code = c("1", "2", "1"), + naaccr_race_code_primary = c("1", "2", "3"), + naaccr_race_code_secondary = c("99", "1", "2"), + naaccr_race_code_tertiary = c("3", "99", "1"), + naaccr_sex_code = c("M", "F", "M") + ) + + # Run the function + result <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping) + + # Test if the output is as expected + expect_equal(result, expected_output) +}) + +test_that("check_for_missing_values - no missing or empty values", { + data <- data.frame( + col1 = c(1, 2, 3), + col2 = c("a", "b", "c"), + genie_patient_id = c("a", "b", "CHOP123"), + naaccr_race_code_tertiary = c("a", "b", "c"), + naaccr_race_code_secondary = c("a", "b", "c") + ) + expect_no_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary"))) + +}) + +test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in centers other than CHOP, PROV, JHU", { + data <- data.frame( + col1 = c(1, NA, ""), + col2 = c("a", "b", "c"), + genie_patient_id = c("CHOP123", "b", "PROV234"), + naaccr_race_code_tertiary = c("a", "", "c"), + naaccr_race_code_secondary = c("a", "b", "c") + ) + expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")), + "Warning: Missing or empty values found in column\\(s\\): naaccr_race_code_tertiary, col1") +}) + +test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in CHOP, PROV, JHU centers", { + data <- data.frame( + col1 = c(1, NA, ""), + col2 = c("a", "", "c"), + genie_patient_id = c("CHOP123", "b", "PROV234"), + naaccr_race_code_tertiary = c("", "b", "c"), + naaccr_race_code_secondary = c("a", "b", NA) + ) + expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")), + "Warning: Missing or empty values found in column\\(s\\): col2, col1") +})