Skip to content

Commit

Permalink
Remove time-shifted data
Browse files Browse the repository at this point in the history
  • Loading branch information
emilycantrell committed Dec 5, 2024
1 parent 1fd2d62 commit 9b2acb9
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 53 deletions.
Binary file modified model.rds
Binary file not shown.
15 changes: 0 additions & 15 deletions submission.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,6 @@ clean_df <- function(df, background_df) {
# Returns:
# data frame: The cleaned dataframe with only the necessary columns and processed variables.

#### TIME-SHIFTED DATA INDICATOR ####
# The time shifted data already has a column called time_shifted_data, where
# time_shifted_data = 1. For the regular data, we need to create time_shifted_data = 0.
if (!"time_shifted_data" %in% colnames(df)) {
df <- df %>%
mutate(time_shifted_data = 0)
}

#### MERGE IN PARTNER DATA IF THE PARTNER ALSO PARTICIPATED IN THE SURVEY ####
# Make a vector of features to merge in from the partner's survey, for use in modeling
features_to_use_as_partner_data_in_model <- c(
Expand Down Expand Up @@ -120,12 +112,6 @@ clean_df <- function(df, background_df) {
# not have all household IDs.
background_df20 <- background_df

# If this is time-shifted data, filter the background data to 2017 and earlier
if(unique(df$time_shifted_data) == 1) {
background_df <- background_df %>%
filter(wave <= 201712)
}

# For each person, filter to only the most recent wave in which they appeared
background_most_recent_wave <- background_df %>%
group_by(nomem_encr) %>%
Expand Down Expand Up @@ -197,7 +183,6 @@ clean_df <- function(df, background_df) {
keepcols <- c(
"nomem_encr", # ID variable required for predictions,
"outcome_available", # Is there an outcome to predict?
"time_shifted_data", # Indicates whether this is original data or time-shifted data
"partner_survey_available", # Indicates whether we merged in data from partner who also participated in survey
# Savings
"ca20g012", "ca20g013", "ca20g078",
Expand Down
43 changes: 5 additions & 38 deletions training.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,22 @@ library(tidyr)
library(tidymodels)
library(xgboost)

train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023,
cleaned_train_2018to2020, outcome_2018to2020) {
train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023) {
# Trains a model using the cleaned dataframe and saves the model to a file.

# Parameters (all of these are dataframes):
# cleaned_train_2021to2023: PreFer_train_data.csv after it has gone through the clean_df function
# outcome_2021to2023: PreFer_train_outcome.csv
# cleaned_train_2018to2020: A "time-shifted" dataframe of feature data, after it has gone through the clean_df function
# outcome_2018to2020: Outcome data for fertility in 2018-2020

set.seed(0)

# Combine cleaned training data and outcome_df
model_df_2021to2023 <- merge(cleaned_train_2021to2023, outcome_2021to2023, by = "nomem_encr") %>%
mutate(new_child = factor(new_child))

model_df_2018to2020 <- merge(cleaned_train_2018to2020, outcome_2018to2020, by = "nomem_encr") %>%
mutate(new_child = factor(new_child))

original_plus_timeshifted_model_df <- bind_rows(model_df_2021to2023, model_df_2018to2020)

# Set up a recipe that removes the ids, dummy-encodes the categorical variables,
# and mean-imputes everything
recipe <- recipe(new_child ~ ., original_plus_timeshifted_model_df) %>%
recipe <- recipe(new_child ~ ., model_df_2021to2023) %>%
step_rm(nomem_encr, nohouse_encr) %>%
step_mutate(across(c(cf18k128, cf19l128, cf20m128,
cf20m128_PartnerSurvey, cf19l128_PartnerSurvey,
Expand All @@ -54,39 +46,14 @@ train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023,
set_engine("xgboost", counts = FALSE)
# Set up cross-validation folds

# Set up CV folds within the original data
# Set up CV folds
n_folds <- 5
folds <- filter(original_plus_timeshifted_model_df, time_shifted_data == 0
) %>%
folds <- model_df_2021to2023 %>%
group_vfold_cv(
group = nohouse_encr, # Puts household members in same fold as each other
balance = "observations",
v = n_folds
)
# Within each CV fold, append time-shifted data.
# Note: We are appending time-shifted data here rather than prior to creating the
# CV folds because we only want time-shifted data in training folds, not in test folds.
# We then make sure that the time-shifted people we append do not come from the same
# households as those in the test folds
for (i in 1:n_folds) {
# Identify what index the first time-shifted observation will be placed at
start_index <- nrow(folds$splits[[i]][[1]]) + 1
# Append the time-shifted data but exclude those in the same households as
# in the test fold
test_fold <- folds$splits[[i]][[1]][-folds$splits[[i]][[2]], ]
folds$splits[[i]][[1]] <- bind_rows(
folds$splits[[i]][[1]],
filter(original_plus_timeshifted_model_df,
time_shifted_data == 1,
!nohouse_encr %in% test_fold$nohouse_encr
)
)
# Add the indices for time-shifted data to the vector of train fold indices
end_index <- nrow(folds$splits[[i]][[1]])
time_shifted_data_indices <- c(start_index:end_index)
folds$splits[[i]][[2]] <-
c(folds$splits[[i]][[2]], time_shifted_data_indices)
}

# Grid search for hyperparameter tuning
grid <- expand.grid(
Expand Down Expand Up @@ -115,7 +82,7 @@ train_save_model <- function(cleaned_train_2021to2023, outcome_2021to2023,
model <- workflow() %>%
add_model(model) %>%
add_recipe(recipe) %>%
fit(original_plus_timeshifted_model_df)
fit(model_df_2021to2023)

# Save the model
saveRDS(model, "model.rds")
Expand Down

0 comments on commit 9b2acb9

Please sign in to comment.