From b6fa640763320a6a43471d361f99b72b13238772 Mon Sep 17 00:00:00 2001
From: danlu1 <dan.lu@sagebase.org>
Date: Wed, 11 Sep 2024 20:02:20 +0000
Subject: [PATCH 01/11] use detailed columns to extract tier1a code

---
 .../export_bpc_selected_cases.R               | 35 ++++++++++---------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R
index ff6ac28f..5bece297 100644
--- a/scripts/case_selection/export_bpc_selected_cases.R
+++ b/scripts/case_selection/export_bpc_selected_cases.R
@@ -42,12 +42,12 @@ if (is.null(opt$input) || is.null(opt$phase) || is.null(opt$cohort) || is.null(o
   stop("Usage: Rscript export_bpc_selected_cases.R -h")
 }
 
-in_file <- opt$input
-out_folder <- opt$output
-phase <- opt$phase
-cohort <- opt$cohort
-site <- opt$site
-release <- opt$release
+in_file <- 'syn62828306'
+out_folder <- 'syn62828556'
+phase <- 'phase 1'
+cohort <- 'NSCLC'
+site <- 'DFCI'
+release <- '17.2-consortium'
 
 # check user input -----------------
 
@@ -84,7 +84,7 @@ print("get clinical data")
 sex_mapping <- synTableQuery("SELECT * FROM syn7434222")$asDataFrame()
 race_mapping <- synTableQuery("SELECT * FROM syn7434236")$asDataFrame()
 ethnicity_mapping <- synTableQuery("SELECT * FROM syn7434242")$asDataFrame()
-# sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame()
+sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame()
 
 # output setup
 phase_no_space <- sub(" ","_",sub(" ","",phase))
@@ -128,18 +128,18 @@ patient_output$redcap_repeat_instance <- rep("")
 
 patient_output$genie_patient_id <- patient_output$record_id
 patient_output$birth_year <- clinical$birth_year[match(patient_output$genie_patient_id, clinical$patient_id)]
-patient_output$naaccr_ethnicity_code <- clinical$ethnicity[match(patient_output$genie_patient_id, clinical$patient_id)]
-patient_output$naaccr_race_code_primary <- clinical$primary_race[match(patient_output$genie_patient_id, clinical$patient_id)]
-patient_output$naaccr_race_code_secondary <- clinical$secondary_race[match(patient_output$genie_patient_id, clinical$patient_id)]
-patient_output$naaccr_race_code_tertiary <- clinical$tertiary_race[match(patient_output$genie_patient_id, clinical$patient_id)]
-patient_output$naaccr_sex_code <- clinical$sex[match(patient_output$genie_patient_id, clinical$patient_id)]
+patient_output$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_output$genie_patient_id, clinical$patient_id)]
+patient_output$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)]
+patient_output$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)]
+patient_output$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)]
+patient_output$naaccr_sex_code <- clinical$sex_detailed[match(patient_output$genie_patient_id, clinical$patient_id)]
 
 # mapping to code
-patient_output$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_output$naaccr_ethnicity_code, ethnicity_mapping$CBIO_LABEL)]
-patient_output$naaccr_race_code_primary <- race_mapping$CODE[match(patient_output$naaccr_race_code_primary, race_mapping$CBIO_LABEL)]
-patient_output$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_output$naaccr_race_code_secondary, race_mapping$CBIO_LABEL)]
-patient_output$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_output$naaccr_race_code_tertiary, race_mapping$CBIO_LABEL)]
-patient_output$naaccr_sex_code <- sex_mapping$CODE[match(patient_output$naaccr_sex_code,sex_mapping$CBIO_LABEL)]
+patient_output$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_output$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)]
+patient_output$naaccr_race_code_primary <- race_mapping$CODE[match(patient_output$naaccr_race_code_primary, race_mapping$DESCRIPTION)]
+patient_output$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_output$naaccr_race_code_secondary, race_mapping$DESCRIPTION)]
+patient_output$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_output$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)]
+patient_output$naaccr_sex_code <- sex_mapping$CODE[match(patient_output$naaccr_sex_code,sex_mapping$DESCRIPTION)]
 print("recode")
 # recode
 # cannotReleaseHIPAA = NA
@@ -165,6 +165,7 @@ sample_info_list <- lapply(samples_per_patient,function(x){
     temp_df$cpt_genie_sample_id = x[i]
     temp_df$cpt_oncotree_code = clinical$oncotree_code[clinical$sample_id == x[i]]
     temp_df$cpt_sample_type = clinical$sample_type_detailed[clinical$sample_id == x[i]]
+    temp_df$cpt_sample_type = sample_type_mapping$CODE[match(temp_df$cpt_sample_type, sample_type_mapping$DESCRIPTION)]
     temp_df$cpt_seq_assay_id = clinical$seq_assay_id[clinical$sample_id == x[i]]
     temp_df$cpt_seq_date = clinical$seq_year[clinical$sample_id == x[i]]
     temp_df$age_at_seq_report = clinical$age_at_seq_report_days[clinical$sample_id == x[i]]

From 4c6e4af628c317e84f21eb20b3df872da3dc2659 Mon Sep 17 00:00:00 2001
From: Dan Lu <90745557+danlu1@users.noreply.github.com>
Date: Wed, 11 Sep 2024 14:03:40 -0700
Subject: [PATCH 02/11] Update export_bpc_selected_cases.R

revert changes in paramters
---
 scripts/case_selection/export_bpc_selected_cases.R | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R
index 5bece297..2972fc9a 100644
--- a/scripts/case_selection/export_bpc_selected_cases.R
+++ b/scripts/case_selection/export_bpc_selected_cases.R
@@ -42,12 +42,12 @@ if (is.null(opt$input) || is.null(opt$phase) || is.null(opt$cohort) || is.null(o
   stop("Usage: Rscript export_bpc_selected_cases.R -h")
 }
 
-in_file <- 'syn62828306'
-out_folder <- 'syn62828556'
-phase <- 'phase 1'
-cohort <- 'NSCLC'
-site <- 'DFCI'
-release <- '17.2-consortium'
+in_file <- opt$input
+out_folder <- opt$output
+phase <- opt$phase
+cohort <- opt$cohort
+site <- opt$site
+release <- opt$release
 
 # check user input -----------------
 

From a63b62fbecd29164d27941b4bb26bc8589de2b66 Mon Sep 17 00:00:00 2001
From: Dan Lu <90745557+danlu1@users.noreply.github.com>
Date: Wed, 11 Sep 2024 16:01:55 -0700
Subject: [PATCH 03/11] Update export_bpc_selected_cases.R

Update comment in recode section to add description for new values
---
 scripts/case_selection/export_bpc_selected_cases.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R
index 2972fc9a..9db3eabf 100644
--- a/scripts/case_selection/export_bpc_selected_cases.R
+++ b/scripts/case_selection/export_bpc_selected_cases.R
@@ -144,11 +144,11 @@ print("recode")
 # recode
 # cannotReleaseHIPAA = NA
 patient_output$birth_year[which(patient_output$birth_year == "cannotReleaseHIPAA")] <- NA
-# -1 Not collected = 9 Unknown
+# -1 Not collected = 9 Unknown whether Spanish or not
 patient_output$naaccr_ethnicity_code[which(patient_output$naaccr_ethnicity_code == -1)] <- 9
-# -1 Not collected = 99 Unknown
+# -1 Not collected = 99 Unknown by patient
 patient_output$naaccr_race_code_primary[which(patient_output$naaccr_race_code_primary == -1)] <- 99
-# -1 Not collected = 88 according to NAACCR
+# -1 Not collected = 88 No further race documented according to NAACCR
 patient_output$naaccr_race_code_secondary[which(patient_output$naaccr_race_code_secondary == -1)] <- 88
 patient_output$naaccr_race_code_tertiary[which(patient_output$naaccr_race_code_tertiary == -1)] <- 88
 

From 7677a3b76ce715044d579f3c6cf2fa8040f4811d Mon Sep 17 00:00:00 2001
From: Dan Lu <90745557+danlu1@users.noreply.github.com>
Date: Wed, 11 Sep 2024 16:09:50 -0700
Subject: [PATCH 04/11] Update export_bpc_selected_cases.R

keep sample_type as descriptive values
---
 scripts/case_selection/export_bpc_selected_cases.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R
index 9db3eabf..15b2423f 100644
--- a/scripts/case_selection/export_bpc_selected_cases.R
+++ b/scripts/case_selection/export_bpc_selected_cases.R
@@ -84,7 +84,7 @@ print("get clinical data")
 sex_mapping <- synTableQuery("SELECT * FROM syn7434222")$asDataFrame()
 race_mapping <- synTableQuery("SELECT * FROM syn7434236")$asDataFrame()
 ethnicity_mapping <- synTableQuery("SELECT * FROM syn7434242")$asDataFrame()
-sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame()
+#sample_type_mapping <- synTableQuery("SELECT * FROM syn7434273")$asDataFrame()
 
 # output setup
 phase_no_space <- sub(" ","_",sub(" ","",phase))
@@ -165,7 +165,6 @@ sample_info_list <- lapply(samples_per_patient,function(x){
     temp_df$cpt_genie_sample_id = x[i]
     temp_df$cpt_oncotree_code = clinical$oncotree_code[clinical$sample_id == x[i]]
     temp_df$cpt_sample_type = clinical$sample_type_detailed[clinical$sample_id == x[i]]
-    temp_df$cpt_sample_type = sample_type_mapping$CODE[match(temp_df$cpt_sample_type, sample_type_mapping$DESCRIPTION)]
     temp_df$cpt_seq_assay_id = clinical$seq_assay_id[clinical$sample_id == x[i]]
     temp_df$cpt_seq_date = clinical$seq_year[clinical$sample_id == x[i]]
     temp_df$age_at_seq_report = clinical$age_at_seq_report_days[clinical$sample_id == x[i]]

From 9821503f6dcd032c5acb9bb17d654712c15d371c Mon Sep 17 00:00:00 2001
From: danlu1 <dan.lu@sagebase.org>
Date: Thu, 12 Sep 2024 22:07:09 +0000
Subject: [PATCH 05/11] group mapping logic into one function

---
 .../export_bpc_selected_cases.R               | 61 +++++++++++--------
 1 file changed, 35 insertions(+), 26 deletions(-)

diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R
index 15b2423f..b3187c69 100644
--- a/scripts/case_selection/export_bpc_selected_cases.R
+++ b/scripts/case_selection/export_bpc_selected_cases.R
@@ -42,12 +42,12 @@ if (is.null(opt$input) || is.null(opt$phase) || is.null(opt$cohort) || is.null(o
   stop("Usage: Rscript export_bpc_selected_cases.R -h")
 }
 
-in_file <- opt$input
-out_folder <- opt$output
-phase <- opt$phase
-cohort <- opt$cohort
-site <- opt$site
-release <- opt$release
+in_file <- 'syn62828306'
+out_folder <- 'syn62828556'
+phase <- 'phase 1'
+cohort <- 'NSCLC'
+site <- 'DFCI'
+release <- '17.2-consortium'
 
 # check user input -----------------
 
@@ -120,26 +120,35 @@ existing_patients = selected_cases[selected_cases %in% clinical$patient_id]
 samples_per_patient <- clinical$sample_id[clinical$patient_id %in% selected_cases]
 
 print("map data for each instrument")
-# mapping data for each instrument
-# instrument - patient_characteristics
-patient_output <- data.frame("record_id" = existing_patients)
-patient_output$redcap_repeat_instrument <- rep("")
-patient_output$redcap_repeat_instance <- rep("")
-
-patient_output$genie_patient_id <- patient_output$record_id
-patient_output$birth_year <- clinical$birth_year[match(patient_output$genie_patient_id, clinical$patient_id)]
-patient_output$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_output$genie_patient_id, clinical$patient_id)]
-patient_output$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)]
-patient_output$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)]
-patient_output$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_output$genie_patient_id, clinical$patient_id)]
-patient_output$naaccr_sex_code <- clinical$sex_detailed[match(patient_output$genie_patient_id, clinical$patient_id)]
-
-# mapping to code
-patient_output$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_output$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)]
-patient_output$naaccr_race_code_primary <- race_mapping$CODE[match(patient_output$naaccr_race_code_primary, race_mapping$DESCRIPTION)]
-patient_output$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_output$naaccr_race_code_secondary, race_mapping$DESCRIPTION)]
-patient_output$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_output$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)]
-patient_output$naaccr_sex_code <- sex_mapping$CODE[match(patient_output$naaccr_sex_code,sex_mapping$DESCRIPTION)]
+#'  mapping data for instrument - patient_characteristics
+#' 
+#' @param clinical A data frame of released clinical data for selected cases
+#' @param existing_patients A data frame of available patient after case selection
+#' @return A data frame with mapped code
+remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){
+  
+  patient_df <- data.frame("record_id" = existing_patients)
+  patient_df$redcap_repeat_instrument <- rep("")
+  patient_df$redcap_repeat_instance <- rep("")
+  
+  patient_df$genie_patient_id <- patient_df$record_id
+  patient_df$birth_year <- clinical$birth_year[match(patient_df$genie_patient_id, clinical$patient_id)]
+  patient_df$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
+  patient_df$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
+  patient_df$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
+  patient_df$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
+  patient_df$naaccr_sex_code <- clinical$sex_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
+  
+  # mapping to code
+  patient_df$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_df$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)]
+  patient_df$naaccr_race_code_primary <- race_mapping$CODE[match(patient_df$naaccr_race_code_primary, race_mapping$DESCRIPTION)]
+  patient_df$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_df$naaccr_race_code_secondary, race_mapping$DESCRIPTION)]
+  patient_df$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_df$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)]
+  patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)]
+}
+
+patient_output <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping)
+
 print("recode")
 # recode
 # cannotReleaseHIPAA = NA

From 6417c2236d6db66294f9a4bd8148a71f0826d07e Mon Sep 17 00:00:00 2001
From: Dan Lu <90745557+danlu1@users.noreply.github.com>
Date: Thu, 12 Sep 2024 15:30:25 -0700
Subject: [PATCH 06/11] Update export_bpc_selected_cases.R

---
 scripts/case_selection/export_bpc_selected_cases.R | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R
index b3187c69..664c6e65 100644
--- a/scripts/case_selection/export_bpc_selected_cases.R
+++ b/scripts/case_selection/export_bpc_selected_cases.R
@@ -42,12 +42,12 @@ if (is.null(opt$input) || is.null(opt$phase) || is.null(opt$cohort) || is.null(o
   stop("Usage: Rscript export_bpc_selected_cases.R -h")
 }
 
-in_file <- 'syn62828306'
-out_folder <- 'syn62828556'
-phase <- 'phase 1'
-cohort <- 'NSCLC'
-site <- 'DFCI'
-release <- '17.2-consortium'
+in_file <- opt$input
+out_folder <- opt$output
+phase <- opt$phase
+cohort <- opt$cohort
+site <- opt$site
+release <- opt$release
 
 # check user input -----------------
 

From 73a852ab2f49efdc3e383b815024a699803eb95e Mon Sep 17 00:00:00 2001
From: danlu1 <dan.lu@sagebase.org>
Date: Fri, 13 Sep 2024 00:19:17 +0000
Subject: [PATCH 07/11] add patient characteristics remap function to
 shared_fxns and add test to it

---
 scripts/case_selection/shared_fxns.R          | 29 +++++++++++
 .../case_selection/tests/test_shared_fxns.R   | 51 +++++++++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/scripts/case_selection/shared_fxns.R b/scripts/case_selection/shared_fxns.R
index 2c1a581b..0d3ca642 100644
--- a/scripts/case_selection/shared_fxns.R
+++ b/scripts/case_selection/shared_fxns.R
@@ -255,4 +255,33 @@ get_main_genie_clinical_id <- function(release){
     }
   }
   return(NULL)
+}
+
+#'  Mapping data for patient_characteristics
+#' 
+#' @param clinical A data frame of released clinical data for selected cases
+#' @param existing_patients A data frame of available patient after case selection
+#' @return A data frame with mapped code
+remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){
+  
+  patient_df <- data.frame("record_id" = existing_patients)
+  patient_df$redcap_repeat_instrument <- rep("")
+  patient_df$redcap_repeat_instance <- rep("")
+  
+  patient_df$genie_patient_id <- patient_df$record_id
+  patient_df$birth_year <- clinical$birth_year[match(patient_df$genie_patient_id, clinical$patient_id)]
+  patient_df$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
+  patient_df$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
+  patient_df$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
+  patient_df$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
+  patient_df$naaccr_sex_code <- clinical$sex_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
+  
+  # mapping to code
+  patient_df$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_df$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)]
+  patient_df$naaccr_race_code_primary <- race_mapping$CODE[match(patient_df$naaccr_race_code_primary, race_mapping$DESCRIPTION)]
+  patient_df$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_df$naaccr_race_code_secondary, race_mapping$DESCRIPTION)]
+  patient_df$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_df$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)]
+  patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)]
+
+  return(patient_df)
 }
\ No newline at end of file
diff --git a/scripts/case_selection/tests/test_shared_fxns.R b/scripts/case_selection/tests/test_shared_fxns.R
index 60aa7604..0db589ce 100644
--- a/scripts/case_selection/tests/test_shared_fxns.R
+++ b/scripts/case_selection/tests/test_shared_fxns.R
@@ -52,3 +52,54 @@ test_that("get_main_genie_clinical_id returns NULL when data_clinical.txt does n
   result <- get_main_genie_clinical_id(release)
   expect_null(result)
 })
+
+test_that("remap_patient_characteristics works as expected", {
+  
+  # Mock input data
+  clinical <- data.frame(
+    patient_id = c(1, 2, 3),
+    birth_year = c(1980, 1990, 2000),
+    ethnicity_detailed = c("Hispanic", "Non-Hispanic", "Hispanic"),
+    primary_race_detailed = c("White", "Black", "Asian"),
+    secondary_race_detailed = c("Unknown", "White", "Black"),
+    tertiary_race_detailed = c("Asian", "Unknown", "White"),
+    sex_detailed = c("Male", "Female", "Male")
+  )
+  
+  existing_patients <-  c(1, 2, 3)
+  
+  ethnicity_mapping <- data.frame(
+    DESCRIPTION = c("Hispanic", "Non-Hispanic"),
+    CODE = c("1", "2")
+  )
+  
+  race_mapping <- data.frame(
+    DESCRIPTION = c("White", "Black", "Asian", "Unknown"),
+    CODE = c("1", "2", "3", "99")
+  )
+  
+  sex_mapping <- data.frame(
+    DESCRIPTION = c("Male", "Female"),
+    CODE = c("M", "F")
+  )
+  
+  # Expected output
+  expected_output <- data.frame(
+    record_id = c(1, 2, 3),
+    redcap_repeat_instrument = c("", "", ""),
+    redcap_repeat_instance = c("", "", ""),
+    genie_patient_id = c(1, 2, 3),
+    birth_year = c(1980, 1990, 2000),
+    naaccr_ethnicity_code = c("1", "2", "1"),
+    naaccr_race_code_primary = c("1", "2", "3"),
+    naaccr_race_code_secondary = c("99", "1", "2"),
+    naaccr_race_code_tertiary = c("3", "99", "1"),
+    naaccr_sex_code = c("M", "F", "M")
+  )
+  
+  # Run the function
+  result <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping)
+  
+  # Test if the output is as expected
+  expect_equal(result, expected_output)
+})

From d5ab24fa4f7d0e217eaaa63dffc563e2a1805fb4 Mon Sep 17 00:00:00 2001
From: danlu1 <dan.lu@sagebase.org>
Date: Fri, 13 Sep 2024 03:35:10 +0000
Subject: [PATCH 08/11] relocate remap_patient_characteristics

---
 .../export_bpc_selected_cases.R               | 28 +------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R
index 664c6e65..23b530d2 100644
--- a/scripts/case_selection/export_bpc_selected_cases.R
+++ b/scripts/case_selection/export_bpc_selected_cases.R
@@ -120,33 +120,7 @@ existing_patients = selected_cases[selected_cases %in% clinical$patient_id]
 samples_per_patient <- clinical$sample_id[clinical$patient_id %in% selected_cases]
 
 print("map data for each instrument")
-#'  mapping data for instrument - patient_characteristics
-#' 
-#' @param clinical A data frame of released clinical data for selected cases
-#' @param existing_patients A data frame of available patient after case selection
-#' @return A data frame with mapped code
-remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){
-  
-  patient_df <- data.frame("record_id" = existing_patients)
-  patient_df$redcap_repeat_instrument <- rep("")
-  patient_df$redcap_repeat_instance <- rep("")
-  
-  patient_df$genie_patient_id <- patient_df$record_id
-  patient_df$birth_year <- clinical$birth_year[match(patient_df$genie_patient_id, clinical$patient_id)]
-  patient_df$naaccr_ethnicity_code <- clinical$ethnicity_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
-  patient_df$naaccr_race_code_primary <- clinical$primary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
-  patient_df$naaccr_race_code_secondary <- clinical$secondary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
-  patient_df$naaccr_race_code_tertiary <- clinical$tertiary_race_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
-  patient_df$naaccr_sex_code <- clinical$sex_detailed[match(patient_df$genie_patient_id, clinical$patient_id)]
-  
-  # mapping to code
-  patient_df$naaccr_ethnicity_code <- ethnicity_mapping$CODE[match(patient_df$naaccr_ethnicity_code, ethnicity_mapping$DESCRIPTION)]
-  patient_df$naaccr_race_code_primary <- race_mapping$CODE[match(patient_df$naaccr_race_code_primary, race_mapping$DESCRIPTION)]
-  patient_df$naaccr_race_code_secondary <- race_mapping$CODE[match(patient_df$naaccr_race_code_secondary, race_mapping$DESCRIPTION)]
-  patient_df$naaccr_race_code_tertiary <- race_mapping$CODE[match(patient_df$naaccr_race_code_tertiary, race_mapping$DESCRIPTION)]
-  patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)]
-}
-
+# instrument - patient_characteristics
 patient_output <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping)
 
 print("recode")

From 4919998bcda7d6a889de0394fad0f4b9743a02c4 Mon Sep 17 00:00:00 2001
From: danlu1 <dan.lu@sagebase.org>
Date: Fri, 13 Sep 2024 20:03:17 +0000
Subject: [PATCH 09/11] throw warning instead in check_for_missing_values

---
 .../export_bpc_selected_cases.R               |  5 +++
 scripts/case_selection/shared_fxns.R          | 22 +++++++++++-
 .../case_selection/tests/test_shared_fxns.R   | 35 +++++++++++++++++++
 3 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/scripts/case_selection/export_bpc_selected_cases.R b/scripts/case_selection/export_bpc_selected_cases.R
index 23b530d2..faab50dd 100644
--- a/scripts/case_selection/export_bpc_selected_cases.R
+++ b/scripts/case_selection/export_bpc_selected_cases.R
@@ -122,6 +122,11 @@ samples_per_patient <- clinical$sample_id[clinical$patient_id %in% selected_case
 print("map data for each instrument")
 # instrument - patient_characteristics
 patient_output <- remap_patient_characteristics(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping)
+# check missing values
+# get naaccr code columns
+naaccr_col <- grep("naaccr", colnames(patient_output), value = TRUE)
+# error out if NAs or empty strings are detected in naaccr code columns
+check_for_missing_values(patient_output, naaccr_col)
 
 print("recode")
 # recode
diff --git a/scripts/case_selection/shared_fxns.R b/scripts/case_selection/shared_fxns.R
index 0d3ca642..e1e49f6c 100644
--- a/scripts/case_selection/shared_fxns.R
+++ b/scripts/case_selection/shared_fxns.R
@@ -261,6 +261,9 @@ get_main_genie_clinical_id <- function(release){
 #' 
 #' @param clinical A data frame of released clinical data for selected cases
 #' @param existing_patients A data frame of available patient after case selection
+#' @param ethnicity_mapping The NAACCR_ETHNICITY_MAPPING data frame
+#' @param race_mapping The NAACCR_RACE_MAPPING data frame
+#' @param sex_mapping The NAACCR_SEX_MAPPING data frame
 #' @return A data frame with mapped code
 remap_patient_characteristics <- function(clinical, existing_patients, ethnicity_mapping, race_mapping, sex_mapping){
   
@@ -284,4 +287,21 @@ remap_patient_characteristics <- function(clinical, existing_patients, ethnicity
   patient_df$naaccr_sex_code <- sex_mapping$CODE[match(patient_df$naaccr_sex_code,sex_mapping$DESCRIPTION)]
 
   return(patient_df)
-}
\ No newline at end of file
+}
+
+#' Check for missing values
+#'
+#' @param data The data frame to check against
+#' @param columns The target columns
+check_for_missing_values <- function(data, columns) {
+  # Check for NA values or empty strings
+  missingness_col <- c()
+  for (col in columns) {
+    if (any(is.na(data[[col]]) | data[[col]] == "" )){
+      missingness_col <- c(col, missingness_col)
+    }
+  }
+  if (length(missingness_col) > 0) {
+    warning(paste0("Warning: Missing or empty values found in column(s): ", paste(missingness_col,collapse=", ")))
+    }
+}
diff --git a/scripts/case_selection/tests/test_shared_fxns.R b/scripts/case_selection/tests/test_shared_fxns.R
index 0db589ce..dbde090f 100644
--- a/scripts/case_selection/tests/test_shared_fxns.R
+++ b/scripts/case_selection/tests/test_shared_fxns.R
@@ -103,3 +103,38 @@ test_that("remap_patient_characteristics works as expected", {
   # Test if the output is as expected
   expect_equal(result, expected_output)
 })
+
+test_that("check_for_missing_values - no missing or empty values in the data", {
+  data <- data.frame(
+    col1 = c(1, 2, 3),
+    col2 = c("a", "b", "c")
+  )
+  expect_warning(check_for_missing_values(data, c("col1", "col2")), NA)
+})
+
+test_that("check_for_missing_values - NAs are detected", {
+  data <- data.frame(
+    col1 = c(1, NA, 3),
+    col2 = c("a", "b", "c")
+  )
+  expect_warning(check_for_missing_values(data, c("col1", "col2")), 
+              "Warning: Missing or empty values found in column\\(s\\): col1")
+})
+
+test_that("check_for_missing_values - empty string values are detected", {
+  data <- data.frame(
+    col1 = c(1, 2, 3),
+    col2 = c("a", "", "c")
+  )
+  expect_warning(check_for_missing_values(data, c("col1", "col2")), 
+               "Warning: Missing or empty values found in column\\(s\\): col2")
+})
+
+test_that("check_for_missing_values - multiple missing and empty values are detected", {
+  data <- data.frame(
+    col1 = c(1, NA, 3),
+    col2 = c("a", "", "c")
+  )
+  expect_warning(check_for_missing_values(data, c("col1", "col2")), 
+               "Warning: Missing or empty values found in column\\(s\\): col2, col1")
+})
\ No newline at end of file

From 65f371dda6c198e32a674bcfc5729c5cce628e99 Mon Sep 17 00:00:00 2001
From: danlu1 <dan.lu@sagebase.org>
Date: Fri, 13 Sep 2024 22:14:12 +0000
Subject: [PATCH 10/11] filter out CHOP, PROV, JHU rows when checking
 missingness

---
 scripts/case_selection/shared_fxns.R          |  4 ++-
 .../case_selection/tests/test_shared_fxns.R   | 34 +++++++++++++------
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/scripts/case_selection/shared_fxns.R b/scripts/case_selection/shared_fxns.R
index e1e49f6c..08688c9c 100644
--- a/scripts/case_selection/shared_fxns.R
+++ b/scripts/case_selection/shared_fxns.R
@@ -289,11 +289,13 @@ remap_patient_characteristics <- function(clinical, existing_patients, ethnicity
   return(patient_df)
 }
 
-#' Check for missing values
+#' Check for missing values in naaccr columns
 #'
 #' @param data The data frame to check against
 #' @param columns The target columns
 check_for_missing_values <- function(data, columns) {
+  # filter out CHOP, PROV, JHU centers with known NAs
+  data <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ]
   # Check for NA values or empty strings
   missingness_col <- c()
   for (col in columns) {
diff --git a/scripts/case_selection/tests/test_shared_fxns.R b/scripts/case_selection/tests/test_shared_fxns.R
index dbde090f..93e5ecf7 100644
--- a/scripts/case_selection/tests/test_shared_fxns.R
+++ b/scripts/case_selection/tests/test_shared_fxns.R
@@ -104,37 +104,51 @@ test_that("remap_patient_characteristics works as expected", {
   expect_equal(result, expected_output)
 })
 
-test_that("check_for_missing_values - no missing or empty values in the data", {
+test_that("check_for_missing_values - no missing or empty values in centers other than CHOP, PROV, JHU", {
   data <- data.frame(
-    col1 = c(1, 2, 3),
-    col2 = c("a", "b", "c")
+    col1 = c(1, 2, 3, NA),
+    col2 = c("a", "b", "c", ""),
+    genie_patient_id = c('a', 'b', 'c', 'CHOP123')
   )
   expect_warning(check_for_missing_values(data, c("col1", "col2")), NA)
+
 })
 
-test_that("check_for_missing_values - NAs are detected", {
+test_that("check_for_missing_values - NAs are detected in centers other than CHOP, PROV, JHU", {
   data <- data.frame(
     col1 = c(1, NA, 3),
-    col2 = c("a", "b", "c")
+    col2 = c("a", "b", "c"),
+    genie_patient_id = c('CHOP123', 'b', 'PROV234')
   )
   expect_warning(check_for_missing_values(data, c("col1", "col2")), 
               "Warning: Missing or empty values found in column\\(s\\): col1")
 })
 
-test_that("check_for_missing_values - empty string values are detected", {
+test_that("check_for_missing_values - empty string values are detected in centers other than CHOP, PROV, JHU", {
   data <- data.frame(
     col1 = c(1, 2, 3),
-    col2 = c("a", "", "c")
+    col2 = c("a", "", "c"),
+    genie_patient_id = c('CHOP123', 'b', 'PROV234')
   )
   expect_warning(check_for_missing_values(data, c("col1", "col2")), 
                "Warning: Missing or empty values found in column\\(s\\): col2")
 })
 
-test_that("check_for_missing_values - multiple missing and empty values are detected", {
+test_that("check_for_missing_values - multiple missing and empty values are detected in centers other than CHOP, PROV, JHU", {
   data <- data.frame(
-    col1 = c(1, NA, 3),
-    col2 = c("a", "", "c")
+    col1 = c(1, NA, ""),
+    col2 = c("a", "", "c"),
+    genie_patient_id = c('CHOP123', 'b', 'PROV234')
   )
   expect_warning(check_for_missing_values(data, c("col1", "col2")), 
                "Warning: Missing or empty values found in column\\(s\\): col2, col1")
+})
+
+test_that("check_for_missing_values - multiple missing and empty values are detected in CHOP, PROV, JHU centers", {
+  data <- data.frame(
+    col1 = c(1, NA, 2),
+    col2 = c("a", "", "c"),
+    genie_patient_id = c('a', 'CHOP123', 'PROV234')
+  )
+  expect_warning(check_for_missing_values(data, c("col1", "col2")), NA)
 })
\ No newline at end of file

From 577b2995244c3e4b80d75032d269aaafb03b3f8d Mon Sep 17 00:00:00 2001
From: danlu1 <dan.lu@sagebase.org>
Date: Mon, 16 Sep 2024 17:30:29 +0000
Subject: [PATCH 11/11] CHOP|PROV|JHU only if checking for missiness in NAACCR
 code columns

---
 scripts/case_selection/shared_fxns.R          | 12 +++--
 .../case_selection/tests/test_shared_fxns.R   | 51 +++++++------------
 2 files changed, 27 insertions(+), 36 deletions(-)

diff --git a/scripts/case_selection/shared_fxns.R b/scripts/case_selection/shared_fxns.R
index 08688c9c..c70e9d5f 100644
--- a/scripts/case_selection/shared_fxns.R
+++ b/scripts/case_selection/shared_fxns.R
@@ -294,15 +294,19 @@ remap_patient_characteristics <- function(clinical, existing_patients, ethnicity
 #' @param data The data frame to check against
 #' @param columns The target columns
 check_for_missing_values <- function(data, columns) {
-  # filter out CHOP, PROV, JHU centers with known NAs
-  data <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ]
   # Check for NA values or empty strings
   missingness_col <- c()
   for (col in columns) {
-    if (any(is.na(data[[col]]) | data[[col]] == "" )){
+    if (col %in% c("naaccr_race_code_tertiary", "naaccr_race_code_secondary")) {
+      # filter out CHOP, PROV, JHU centers with known NAs in NAACCR code columns
+      relevant_rows <- data[!grepl("CHOP|PROV|JHU", data$genie_patient_id), ]
+    } else{
+      relevant_rows <- data
+    }
+    if (any(is.na(relevant_rows[[col]]) | relevant_rows[[col]] == "" )){
       missingness_col <- c(col, missingness_col)
     }
-  }
+    }
   if (length(missingness_col) > 0) {
     warning(paste0("Warning: Missing or empty values found in column(s): ", paste(missingness_col,collapse=", ")))
     }
diff --git a/scripts/case_selection/tests/test_shared_fxns.R b/scripts/case_selection/tests/test_shared_fxns.R
index 93e5ecf7..fdd7fdb3 100644
--- a/scripts/case_selection/tests/test_shared_fxns.R
+++ b/scripts/case_selection/tests/test_shared_fxns.R
@@ -104,51 +104,38 @@ test_that("remap_patient_characteristics works as expected", {
   expect_equal(result, expected_output)
 })
 
-test_that("check_for_missing_values - no missing or empty values in centers other than CHOP, PROV, JHU", {
+test_that("check_for_missing_values - no missing or empty values", {
   data <- data.frame(
-    col1 = c(1, 2, 3, NA),
-    col2 = c("a", "b", "c", ""),
-    genie_patient_id = c('a', 'b', 'c', 'CHOP123')
+    col1 = c(1, 2, 3),
+    col2 = c("a", "b", "c"),
+    genie_patient_id = c("a", "b", "CHOP123"),
+    naaccr_race_code_tertiary = c("a", "b", "c"),
+    naaccr_race_code_secondary = c("a", "b", "c")
   )
-  expect_warning(check_for_missing_values(data, c("col1", "col2")), NA)
+  expect_no_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")))
 
 })
 
-test_that("check_for_missing_values - NAs are detected in centers other than CHOP, PROV, JHU", {
+test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in centers other than CHOP, PROV, JHU", {
   data <- data.frame(
-    col1 = c(1, NA, 3),
+    col1 = c(1, NA, ""),
     col2 = c("a", "b", "c"),
-    genie_patient_id = c('CHOP123', 'b', 'PROV234')
+    genie_patient_id = c("CHOP123", "b", "PROV234"),
+    naaccr_race_code_tertiary = c("a", "", "c"),
+    naaccr_race_code_secondary = c("a", "b", "c")
   )
-  expect_warning(check_for_missing_values(data, c("col1", "col2")), 
-              "Warning: Missing or empty values found in column\\(s\\): col1")
+  expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")), 
+              "Warning: Missing or empty values found in column\\(s\\): naaccr_race_code_tertiary, col1")
 })
 
-test_that("check_for_missing_values - empty string values are detected in centers other than CHOP, PROV, JHU", {
-  data <- data.frame(
-    col1 = c(1, 2, 3),
-    col2 = c("a", "", "c"),
-    genie_patient_id = c('CHOP123', 'b', 'PROV234')
-  )
-  expect_warning(check_for_missing_values(data, c("col1", "col2")), 
-               "Warning: Missing or empty values found in column\\(s\\): col2")
-})
-
-test_that("check_for_missing_values - multiple missing and empty values are detected in centers other than CHOP, PROV, JHU", {
+test_that("check_for_missing_values - missingness values are detected in NAACCR code columns in CHOP, PROV, JHU centers", {
   data <- data.frame(
     col1 = c(1, NA, ""),
     col2 = c("a", "", "c"),
-    genie_patient_id = c('CHOP123', 'b', 'PROV234')
+    genie_patient_id = c("CHOP123", "b", "PROV234"),
+    naaccr_race_code_tertiary = c("", "b", "c"),
+    naaccr_race_code_secondary = c("a", "b", NA)
   )
-  expect_warning(check_for_missing_values(data, c("col1", "col2")), 
+  expect_warning(check_for_missing_values(data, c("col1", "col2", "naaccr_race_code_tertiary", "naaccr_race_code_secondary")), 
                "Warning: Missing or empty values found in column\\(s\\): col2, col1")
 })
-
-test_that("check_for_missing_values - multiple missing and empty values are detected in CHOP, PROV, JHU centers", {
-  data <- data.frame(
-    col1 = c(1, NA, 2),
-    col2 = c("a", "", "c"),
-    genie_patient_id = c('a', 'CHOP123', 'PROV234')
-  )
-  expect_warning(check_for_missing_values(data, c("col1", "col2")), NA)
-})
\ No newline at end of file