Skip to content

Commit

Permalink
Merge pull request #152 from Sage-Bionetworks/GEN-1476-use-tier1a-code
Browse files Browse the repository at this point in the history
[GEN-1476] use detailed columns to extract tier1a code
  • Loading branch information
danlu1 authored Sep 16, 2024
2 parents eb1dc27 + 577b299 commit 4df1dca
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 24 deletions.
34 changes: 11 additions & 23 deletions scripts/case_selection/export_bpc_selected_cases.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ print("get clinical data")
sex_mapping <- synTableQuery("SELECT * FROM syn7434222")$asDataFrame()
race_mapping <- synTableQuery("SELECT * FROM syn7434236")$asDataFrame()
ethnicity_mapping <- synTableQuery("SELECT * FROM syn7434242")$asDataFrame()
# sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame()
#sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame()

# output setup
phase_no_space <- sub(" ","_",sub(" ","",phase))
Expand Down Expand Up @@ -120,35 +120,23 @@ existing_patients = selected_cases[selected_cases %in% clinical$patient_id]
samples_per_patient <- clinical$sample_id[clinical$patient_id %in% selected_cases]

print("map data for each instrument")
# mapping data for each instrument
# instrument - patient_characteristics
patient_output <- data.frame("record_id" = existing_patients)
patient_output$redcap_repeat_instrument <- rep("")
patient_output$redcap_repeat_instance <- rep("")

patient_output$genie_patient_id <- patient_output$record_id
patient_output$birth_year <- clinical$birth_year[match(patient_output$genie_patient_id, clinical$patient_id)]
patient_output$naaccr_ethnicity_code <- clinical$ethnicity[match(patient_output$genie_patient_id, clinical$patient_id)]
patient_output$naaccr_race_code_primary <- clinical$primary_race[match(patient_output$genie_patient_id, clinical$patient_id)]
patient_output$naaccr_race_code_secondary <- clinical$secondary_race[match(patient_output$genie_patient_id, clinical$patient_id)]
patient_output$naaccr_race_code_tertiary <- clinical$tertiary_race[match(patient_output$genie_patient_id, clinical$patient_id)]
patient_output$naaccr_sex_code <- clinical$sex[match(patient_output$genie_patient_id, clinical$patient_id)]

# mapping to code
patient_output$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_output$naaccr_ethnicity_code, ethnicity_mapping$CBIO_LABEL)]
patient_output$naaccr_race_code_primary <- race_mapping$CODE[match(patient_output$naaccr_race_code_primary, race_mapping$CBIO_LABEL)]
patient_output$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_output$naaccr_race_code_secondary, race_mapping$CBIO_LABEL)]
patient_output$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_output$naaccr_race_code_tertiary, race_mapping$CBIO_LABEL)]
patient_output$naaccr_sex_code <- sex_mapping$CODE[match(patient_output$naaccr_sex_code,sex_mapping$CBIO_LABEL)]
patient_output <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping)
# check missing values
# get naaccr code columns
naaccr_col <- grep("naaccr", colnames(patient_output), value = TRUE)
# error out if NAs or empty strings are detected in naaccr code columns
check_for_missing_values(patient_output, naaccr_col)

print("recode")
# recode
# cannotReleaseHIPAA = NA
patient_output$birth_year[which(patient_output$birth_year == "cannotReleaseHIPAA")] <- NA
# -1 Not collected = 9 Unknown
# -1 Not collected = 9 Unknown whether Spanish or not
patient_output$naaccr_ethnicity_code[which(patient_output$naaccr_ethnicity_code == -1)] <- 9
# -1 Not collected = 99 Unknown
# -1 Not collected = 99 Unknown by patient
patient_output$naaccr_race_code_primary[which(patient_output$naaccr_race_code_primary == -1)] <- 99
# -1 Not collected = 88 according to NAACCR
# -1 Not collected = 88 No further race documented according to NAACCR
patient_output$naaccr_race_code_secondary[which(patient_output$naaccr_race_code_secondary == -1)] <- 88
patient_output$naaccr_race_code_tertiary[which(patient_output$naaccr_race_code_tertiary == -1)] <- 88

Expand Down
57 changes: 56 additions & 1 deletion scripts/case_selection/shared_fxns.R
Original file line number Diff line number Diff line change
Expand Up @@ -255,4 +255,59 @@ get_main_genie_clinical_id <- function(release){
}
}
return(NULL)
}
}

#' Mapping data for patient_characteristics
#'
#' @param clinical A data frame of released clinical data for selected cases
#' @param existing_patients A data frame of available patient after case selection
#' @param ethnicity_mapping The NAACCR_ETHNICITY_MAPPING data frame
#' @param race_mapping The NAACCR_RACE_MAPPING data frame
#' @param sex_mapping The NAACCR_SEX_MAPPING data frame
#' @return A data frame with mapped code
remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){

patient_df <- data.frame("record_id" = existing_patients)
patient_df$redcap_repeat_instrument <- rep("")
patient_df$redcap_repeat_instance <- rep("")

patient_df$genie_patient_id <- patient_df$record_id
patient_df$birth_year <- clinical$birth_year[match(patient_df$genie_patient_id, clinical$patient_id)]
patient_df$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
patient_df$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
patient_df$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
patient_df$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
patient_df$naaccr_sex_code <- clinical$sex_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]

# mapping to code
patient_df$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_df$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)]
patient_df$naaccr_race_code_primary <- race_mapping$CODE[match(patient_df$naaccr_race_code_primary, race_mapping$DESCRIPTION)]
patient_df$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_df$naaccr_race_code_secondary, race_mapping$DESCRIPTION)]
patient_df$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_df$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)]
patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)]

return(patient_df)
}

#' Check for missing values in naaccr columns
#'
#' @param data The data frame to check against
#' @param columns The target columns
check_for_missing_values <- function(data, columns) {
# Check for NA values or empty strings
missingness_col <- c()
for (col in columns) {
if (col %in% c("naaccr_race_code_tertiary", "naaccr_race_code_secondary")) {
# filter out CHOP, PROV, JHU centers with known NAs in NAACCR code columns
relevant_rows <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ]
} else{
relevant_rows <- data
}
if (any(is.na(relevant_rows[[col]]) | relevant_rows[[col]] == "" )){
missingness_col <- c(col, missingness_col)
}
}
if (length(missingness_col) > 0) {
warning(paste0("Warning: Missing or empty values found in column(s): ", paste(missingness_col,collapse=", ")))
}
}
87 changes: 87 additions & 0 deletions scripts/case_selection/tests/test_shared_fxns.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,90 @@ test_that("get_main_genie_clinical_id returns NULL when data_clinical.txt does n
result <- get_main_genie_clinical_id(release)
expect_null(result)
})

test_that("remap_patient_characteristics works as expected", {

# Mock input data
clinical <- data.frame(
patient_id = c(1, 2, 3),
birth_year = c(1980, 1990, 2000),
ethnicity_detailed = c("Hispanic", "Non-Hispanic", "Hispanic"),
primary_race_detailed = c("White", "Black", "Asian"),
secondary_race_detailed = c("Unknown", "White", "Black"),
tertiary_race_detailed = c("Asian", "Unknown", "White"),
sex_detailed = c("Male", "Female", "Male")
)

existing_patients <- c(1, 2, 3)

ethnicity_mapping <- data.frame(
DESCRIPTION = c("Hispanic", "Non-Hispanic"),
CODE = c("1", "2")
)

race_mapping <- data.frame(
DESCRIPTION = c("White", "Black", "Asian", "Unknown"),
CODE = c("1", "2", "3", "99")
)

sex_mapping <- data.frame(
DESCRIPTION = c("Male", "Female"),
CODE = c("M", "F")
)

# Expected output
expected_output <- data.frame(
record_id = c(1, 2, 3),
redcap_repeat_instrument = c("", "", ""),
redcap_repeat_instance = c("", "", ""),
genie_patient_id = c(1, 2, 3),
birth_year = c(1980, 1990, 2000),
naaccr_ethnicity_code = c("1", "2", "1"),
naaccr_race_code_primary = c("1", "2", "3"),
naaccr_race_code_secondary = c("99", "1", "2"),
naaccr_race_code_tertiary = c("3", "99", "1"),
naaccr_sex_code = c("M", "F", "M")
)

# Run the function
result <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping)

# Test if the output is as expected
expect_equal(result, expected_output)
})

test_that("check_for_missing_values - no missing or empty values", {
data <- data.frame(
col1 = c(1, 2, 3),
col2 = c("a", "b", "c"),
genie_patient_id = c("a", "b", "CHOP123"),
naaccr_race_code_tertiary = c("a", "b", "c"),
naaccr_race_code_secondary = c("a", "b", "c")
)
expect_no_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")))

})

test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in centers other than CHOP, PROV, JHU", {
data <- data.frame(
col1 = c(1, NA, ""),
col2 = c("a", "b", "c"),
genie_patient_id = c("CHOP123", "b", "PROV234"),
naaccr_race_code_tertiary = c("a", "", "c"),
naaccr_race_code_secondary = c("a", "b", "c")
)
expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")),
"Warning: Missing or empty values found in column\\(s\\): naaccr_race_code_tertiary, col1")
})

test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in CHOP, PROV, JHU centers", {
data <- data.frame(
col1 = c(1, NA, ""),
col2 = c("a", "", "c"),
genie_patient_id = c("CHOP123", "b", "PROV234"),
naaccr_race_code_tertiary = c("", "b", "c"),
naaccr_race_code_secondary = c("a", "b", NA)
)
expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")),
"Warning: Missing or empty values found in column\\(s\\): col2, col1")
})

0 comments on commit 4df1dca

Please sign in to comment.