Remove time-shifted data

citp · Dec 5, 2024 · 9b2acb9 · 9b2acb9
1 parent 1fd2d62
commit 9b2acb9
Show file tree

Hide file tree

Showing 3 changed files with 5 additions and 53 deletions.
diff --git a/model.rds b/model.rds
diff --git a/submission.R b/submission.R
@@ -30,14 +30,6 @@ clean_df <- function(df, background_df) {
   # Returns:
   # data frame: The cleaned dataframe with only the necessary columns and processed variables.
 
-  #### TIME-SHIFTED DATA INDICATOR ####
-  # The time shifted data already has a column called time_shifted_data, where
-  # time_shifted_data = 1. For the regular data, we need to create time_shifted_data = 0.
-  if (!"time_shifted_data" %in% colnames(df)) {
-    df <- df %>%
-      mutate(time_shifted_data = 0)
-  }
-
   #### MERGE IN PARTNER DATA IF THE PARTNER ALSO PARTICIPATED IN THE SURVEY ####
   # Make a vector of features to merge in from the partner's survey, for use in modeling
   features_to_use_as_partner_data_in_model <- c(
@@ -120,12 +112,6 @@ clean_df <- function(df, background_df) {
   # not have all household IDs.
   background_df20 <- background_df
 
-  # If this is time-shifted data, filter the background data to 2017 and earlier
-  if(unique(df$time_shifted_data) == 1) { 
-    background_df <- background_df %>%
-      filter(wave <= 201712)
-  }
-
   # For each person, filter to only the most recent wave in which they appeared
   background_most_recent_wave <- background_df %>%
     group_by(nomem_encr) %>%
@@ -197,7 +183,6 @@ clean_df <- function(df, background_df) {
   keepcols <- c(
     "nomem_encr", # ID variable required for predictions,
     "outcome_available", # Is there an outcome to predict?
-    "time_shifted_data", # Indicates whether this is original data or time-shifted data
     "partner_survey_available", # Indicates whether we merged in data from partner who also participated in survey
     # Savings
     "ca20g012", "ca20g013", "ca20g078",

diff --git a/training.R b/training.R
@@ -12,30 +12,22 @@ library(tidyr)
 library(tidymodels)
 library(xgboost)
 
-train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023, 
-                             cleaned_train_2018to2020, outcome_2018to2020) {
+train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023) {
   # Trains a model using the cleaned dataframe and saves the model to a file.
 
   # Parameters (all of these are dataframes):
   # cleaned_train_2021to2023: PreFer_train_data.csv after it has gone through the clean_df function 
   # outcome_2021to2023: PreFer_train_outcome.csv 
-  # cleaned_train_2018to2020: A "time-shifted" dataframe of feature data, after it has gone through the clean_df function
-  # outcome_2018to2020: Outcome data for fertility in 2018-2020
 
   set.seed(0)
 
   # Combine cleaned training data and outcome_df
   model_df_2021to2023 <- merge(cleaned_train_2021to2023, outcome_2021to2023, by = "nomem_encr") %>%
     mutate(new_child = factor(new_child))
 
-  model_df_2018to2020 <- merge(cleaned_train_2018to2020, outcome_2018to2020, by = "nomem_encr") %>%
-    mutate(new_child = factor(new_child))
-
-  original_plus_timeshifted_model_df <- bind_rows(model_df_2021to2023, model_df_2018to2020)
-
   # Set up a recipe that removes the IDs, dummy-encodes the categorical variables,
   # and mean-imputes everything
-  recipe <- recipe(new_child ~ ., original_plus_timeshifted_model_df) %>%
+  recipe <- recipe(new_child ~ ., model_df_2021to2023) %>%
     step_rm(nomem_encr, nohouse_encr) %>%
     step_mutate(across(c(cf18k128, cf19l128, cf20m128,
         cf20m128_PartnerSurvey, cf19l128_PartnerSurvey,
@@ -54,39 +46,14 @@ train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023,
     set_engine("xgboost", counts = FALSE)
   # Set up cross-validation folds
 
-  # Set up CV folds within the original data
+  # Set up CV folds
   n_folds <- 5
-  folds <- filter(original_plus_timeshifted_model_df, time_shifted_data == 0
-  ) %>%
+  folds <- model_df_2021to2023 %>%
     group_vfold_cv(
       group = nohouse_encr, # Puts household members in same fold as each other
       balance = "observations",
       v = n_folds
     )
-  # Within each CV fold, append time-shifted data.
-  # Note: We are appending time-shifted data here rather than prior to creating the
-  # CV folds because we only want time-shifted data in training folds, not in test folds.
-  # We then make sure that the time-shifted people we append do not come from the same
-  # households as those in the test folds
-  for (i in 1:n_folds) {
-    # Identify what index the first time-shifted observation will be placed at
-    start_index <- nrow(folds$splits[[i]][[1]]) + 1
-    # Append the time-shifted data but exclude those in the same households as
-    # in the test fold
-    test_fold <- folds$splits[[i]][[1]][-folds$splits[[i]][[2]], ]
-    folds$splits[[i]][[1]] <- bind_rows(
-      folds$splits[[i]][[1]],
-      filter(original_plus_timeshifted_model_df,
-        time_shifted_data == 1,
-        !nohouse_encr %in% test_fold$nohouse_encr
-      )
-    )
-    # Add the indices for time-shifted data to the vector of train fold indices
-    end_index <- nrow(folds$splits[[i]][[1]])
-    time_shifted_data_indices <- c(start_index:end_index)
-    folds$splits[[i]][[2]] <-
-      c(folds$splits[[i]][[2]], time_shifted_data_indices)
-  }
 
   # Grid search for hyperparameter tuning
   grid <- expand.grid(
@@ -115,7 +82,7 @@ train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023,
   model <- workflow() %>%
     add_model(model) %>%
     add_recipe(recipe) %>%
-    fit(original_plus_timeshifted_model_df)
+    fit(model_df_2021to2023)
 
   # Save the model
   saveRDS(model, "model.rds")