DHSC-Capstone/ML/2-modeling.R

rm(list = ls(all.names = TRUE)) # Clear the memory of variables from previous run.
cat("\014") # Clear the console


# load packages -----------------------------------------------------------

box::use(
  magrittr[`%>%`]
  ,here[here]
  ,readr
  ,gp2 = ggplot2[ggplot, aes]
  ,rsample
  ,r = recipes
  ,wf = workflows
  ,p = parsnip[tune]
  ,ys = yardstick
  ,d = dials
  ,rsamp = rsample
)


# globals -----------------------------------------------------------------

set.seed(070823) #set seed for reproducible research


# load-data ---------------------------------------------------------------

model_data <- readr$read_rds(here("ML","data-unshared","model_data.RDS"))


# split data --------------------------------------------------------------

model_data_split <- rsample$initial_split(
  model_data
  ,prop   = 0.80
  ,strata = ft4_dia
  )

ds_train <- rsample$training(model_data_split)
ds_test  <- rsample$testing(model_data_split)

# verify distribution of data
table(ds_train$ft4_dia) %>% prop.table()
table(ds_test$ft4_dia) %>% prop.table()


class_train <- ds_train %>% dplyr::select(-FT4)  # training data for classification models
reg_train <- ds_train %>% dplyr::select(-ft4_dia) # training data for reg models predicting result

# random forest classification -----------------------------------------------------------

# base model - No Hyper Tuning

rf__base_model <- p$rand_forest() %>%
  p$set_engine("ranger") %>% p$set_mode("classification")

rf_recipe <- r$recipe(ft4_dia ~ . , data = class_train) %>%
  r$update_role(subject_id, new_role = "id") %>%
  r$update_role(charttime, new_role = "time") %>%
  r$step_impute_bag(r$all_predictors())


rf_workflow <- wf$workflow() %>%
  wf$add_model(rf__base_model) %>%
  wf$add_recipe(rf_recipe)

rf_base_fit <- p$fit(rf_workflow, class_train)

rf_predict <- class_train %>%
  dplyr::select(ft4_dia) %>%
  dplyr::bind_cols(
    predict(rf_base_fit, class_train)
    ,predict(rf_base_fit, class_train, type = "prob")
    )

conf_mat_rf <- ys$conf_mat(rf_predict, ft4_dia, .pred_class)


rf_pred <- dplyr::select(class_train, -ft4_dia, -subject_id, -charttime)

rf_tuning_model <- p$rand_forest(trees = tune(), mtry = tune(), min_n = tune()) %>%
  p$set_engine("ranger") %>% p$set_mode("classification")

rf_param <- p$extract_parameter_set_dials(rf_tuning_model)

rf_param <- rf_param %>% update(mtry = d$finalize(d$mtry(), rf_pred))

data_fold <- rsamp$vfold_cv(class_train, v = 5)

rf_workflow <- wf$update_model(rf_workflow, rf_tuning_model)

# takes around 1 hr to run grid search.  saving best params manaually
# rf_tune <- rf_workflow %>%
#     tune::tune_grid(
#     data_fold
#     ,grid = rf_param %>% d$grid_regular()
#   )

rf_best_params <- tibble::tibble(
  mtry = 8
  ,trees = 2000
  ,min_n = 2
)

final_rf_workflow <- rf_workflow %>%
  tune::finalize_workflow(rf_best_params)

final_rf_fit <- p$fit(final_rf_workflow, class_train)

final_rf_predict <- class_train %>%
  dplyr::select(ft4_dia) %>%
  dplyr::bind_cols(
    predict(final_rf_fit, class_train)
    ,predict(final_rf_fit, class_train, type = "prob")
  )

final_conf_rf <- ys$conf_mat(final_rf_predict, ft4_dia, .pred_class)


# random forest regression ------------------------------------------------
#
# reg_metrics <- ys$metric_set(ys$rmse, ys$rsq, ys$mae)
#
# rf_base_reg_model <- p$rand_forest() %>%
#   p$set_engine("ranger") %>% p$set_mode("regression")
#
# rf_reg_recipe <- r$recipe(FT4 ~ . , data = reg_train) %>%
#   r$update_role(subject_id, new_role = "id") %>%
#   r$update_role(charttime, new_role = "time") %>%
#   r$step_impute_bag(r$all_predictors())
#
#
# rf_reg_workflow <- wf$workflow() %>%
#   wf$add_model(rf_base_reg_model) %>%
#   wf$add_recipe(rf_reg_recipe)
#
# rf_base_reg_fit <- p$fit(rf_reg_workflow, reg_train)
#
# rf_reg_predict <- reg_train %>%
#   dplyr::select(FT4) %>%
#   dplyr::bind_cols(
#     predict(rf_base_reg_fit, reg_train)
#   )
#
# reg_metrics(rf_reg_predict, truth = FT4, estimate = .pred)
#
# rf_reg_tune_model <- p$rand_forest(trees = tune(), mtry = tune(), min_n = tune()) %>%
#   p$set_engine("ranger") %>% p$set_mode("regression")
#
# rf_reg_pred <- dplyr::select(reg_train, -FT4, -subject_id, -charttime)
#
# rf_reg_param <- p$extract_parameter_set_dials(rf_reg_tune_model) %>%
#   update(mtry = d$finalize(d$mtry(), rf_reg_pred))
#
# data_fold_reg <- rsamp$vfold_cv(reg_train, v = 5)
#
# rf_reg_workflow <- wf$update_model(rf_reg_workflow, rf_reg_tune_model)
#
# # takes around 1 hr to run grid search.  saving best params manaually
# # rf_reg_tune <- rf_reg_workflow %>%
# #     tune::tune_grid(
# #     data_fold_reg
# #     ,grid = rf_reg_param %>% d$grid_regular()
# #   )
#
# rf_reg_best_params <- tibble::tibble(
#   mtry = 8
#   ,trees = 1000
#   ,min_n = 2
# )
#
# final_rf_reg_workflow <- rf_reg_workflow %>%
#   tune::finalize_workflow(rf_reg_best_params)
#
# final_rf_reg_fit <- p$fit(final_rf_reg_workflow, reg_train)
#
# final_rf_reg_predict <-  reg_train %>%
#   dplyr::select(FT4) %>%
#   dplyr::bind_cols(
#     predict(final_rf_reg_fit, reg_train)
#   )
#
# reg_metrics(final_rf_reg_predict, truth = FT4, estimate = .pred)
Create 2-modeling.R 2023-01-22 15:26:44 -05:00			`rm(list = ls(all.names = TRUE)) # Clear the memory of variables from previous run.`
			`cat("\014") # Clear the console`


			`# load packages -----------------------------------------------------------`

			`box::use(`
			magrittr[`%>%`]
			`,here[here]`
			`,readr`
			`,gp2 = ggplot2[ggplot, aes]`
Update 2-modeling.R 2023-01-22 15:38:44 -05:00			`,rsample`
Update 2-modeling.R 2023-01-31 09:02:19 -05:00			`,r = recipes`
			`,wf = workflows`
Update 2-modeling.R 2023-02-02 13:23:28 -05:00			`,p = parsnip[tune]`
Update 2-modeling.R 2023-01-31 15:29:27 -05:00			`,ys = yardstick`
Update 2-modeling.R 2023-02-02 13:23:28 -05:00			`,d = dials`
			`,rsamp = rsample`
Create 2-modeling.R 2023-01-22 15:26:44 -05:00			`)`

Update 2-modeling.R 2023-01-22 15:38:44 -05:00

			`# globals -----------------------------------------------------------------`

			`set.seed(070823) #set seed for reproducible research`


			`# load-data ---------------------------------------------------------------`

			`model_data <- readr$read_rds(here("ML","data-unshared","model_data.RDS"))`


			`# split data --------------------------------------------------------------`

			`model_data_split <- rsample$initial_split(`
			`model_data`
			`,prop = 0.80`
			`,strata = ft4_dia`
			`)`

			`ds_train <- rsample$training(model_data_split)`
			`ds_test <- rsample$testing(model_data_split)`

			`# verify distribution of data`
			`table(ds_train$ft4_dia) %>% prop.table()`
			`table(ds_test$ft4_dia) %>% prop.table()`

Update 2-modeling.R 2023-01-31 09:02:19 -05:00
Update 2-modeling.R 2023-02-01 11:00:57 -05:00			`class_train <- ds_train %>% dplyr::select(-FT4) # training data for classification models`
			`reg_train <- ds_train %>% dplyr::select(-ft4_dia) # training data for reg models predicting result`
Update 2-modeling.R 2023-01-31 09:02:19 -05:00
Update 2-modeling.R 2023-02-02 13:23:28 -05:00			`# random forest classification -----------------------------------------------------------`
Update 2-modeling.R 2023-01-31 09:02:19 -05:00
Update 2-modeling.R 2023-02-01 11:00:57 -05:00			`# base model - No Hyper Tuning`
Update 2-modeling.R 2023-01-31 15:29:27 -05:00
Update 2-modeling.R 2023-02-02 13:23:28 -05:00			`rf__base_model <- p$rand_forest() %>%`
Update 2-modeling.R 2023-02-01 11:00:57 -05:00			`p$set_engine("ranger") %>% p$set_mode("classification")`
Update 2-modeling.R 2023-01-31 09:02:19 -05:00
Update 2-modeling.R 2023-02-01 11:00:57 -05:00			`rf_recipe <- r$recipe(ft4_dia ~ . , data = class_train) %>%`
Update 2-modeling.R 2023-01-31 09:02:19 -05:00			`r$update_role(subject_id, new_role = "id") %>%`
			`r$update_role(charttime, new_role = "time") %>%`
update data 2023-01-31 19:33:24 -05:00			`r$step_impute_bag(r$all_predictors())`
Update 2-modeling.R 2023-01-31 09:02:19 -05:00

			`rf_workflow <- wf$workflow() %>%`
Update 2-modeling.R 2023-02-02 13:23:28 -05:00			`wf$add_model(rf__base_model) %>%`
Update 2-modeling.R 2023-01-31 09:02:19 -05:00			`wf$add_recipe(rf_recipe)`

Update 2-modeling.R 2023-02-02 13:23:28 -05:00			`rf_base_fit <- p$fit(rf_workflow, class_train)`
Update 2-modeling.R 2023-01-31 15:29:27 -05:00
Update 2-modeling.R 2023-02-01 11:00:57 -05:00			`rf_predict <- class_train %>%`
			`dplyr::select(ft4_dia) %>%`
			`dplyr::bind_cols(`
Update 2-modeling.R 2023-02-02 13:23:28 -05:00			`predict(rf_base_fit, class_train)`
			`,predict(rf_base_fit, class_train, type = "prob")`
Update 2-modeling.R 2023-02-01 11:00:57 -05:00			`)`
Update 2-modeling.R 2023-01-31 16:04:06 -05:00
Update 2-modeling.R 2023-02-01 11:00:57 -05:00			`conf_mat_rf <- ys$conf_mat(rf_predict, ft4_dia, .pred_class)`
Update 2-modeling.R 2023-01-31 16:04:06 -05:00

update data 2023-01-31 19:33:24 -05:00
Update 2-modeling.R 2023-02-02 13:23:28 -05:00			`rf_pred <- dplyr::select(class_train, -ft4_dia, -subject_id, -charttime)`

			`rf_tuning_model <- p$rand_forest(trees = tune(), mtry = tune(), min_n = tune()) %>%`
			`p$set_engine("ranger") %>% p$set_mode("classification")`

			`rf_param <- p$extract_parameter_set_dials(rf_tuning_model)`

			`rf_param <- rf_param %>% update(mtry = d$finalize(d$mtry(), rf_pred))`

			`data_fold <- rsamp$vfold_cv(class_train, v = 5)`

			`rf_workflow <- wf$update_model(rf_workflow, rf_tuning_model)`

Update 2-modeling.R 2023-02-02 20:50:30 -05:00			`# takes around 1 hr to run grid search. saving best params manaually`
			`# rf_tune <- rf_workflow %>%`
			`# tune::tune_grid(`
			`# data_fold`
			`# ,grid = rf_param %>% d$grid_regular()`
			`# )`

			`rf_best_params <- tibble::tibble(`
			`mtry = 8`
			`,trees = 2000`
			`,min_n = 2`
			`)`

			`final_rf_workflow <- rf_workflow %>%`
			`tune::finalize_workflow(rf_best_params)`

			`final_rf_fit <- p$fit(final_rf_workflow, class_train)`

			`final_rf_predict <- class_train %>%`
			`dplyr::select(ft4_dia) %>%`
			`dplyr::bind_cols(`
			`predict(final_rf_fit, class_train)`
			`,predict(final_rf_fit, class_train, type = "prob")`
Update 2-modeling.R 2023-02-02 13:23:28 -05:00			`)`

Update 2-modeling.R 2023-02-02 20:50:30 -05:00			`final_conf_rf <- ys$conf_mat(final_rf_predict, ft4_dia, .pred_class)`
Update 2-modeling.R 2023-02-03 14:37:02 -05:00

			`# random forest regression ------------------------------------------------`
add new script 2023-03-17 16:13:34 -04:00			`#`
			`# reg_metrics <- ys$metric_set(ys$rmse, ys$rsq, ys$mae)`
			`#`
			`# rf_base_reg_model <- p$rand_forest() %>%`
			`# p$set_engine("ranger") %>% p$set_mode("regression")`
			`#`
			`# rf_reg_recipe <- r$recipe(FT4 ~ . , data = reg_train) %>%`
			`# r$update_role(subject_id, new_role = "id") %>%`
			`# r$update_role(charttime, new_role = "time") %>%`
			`# r$step_impute_bag(r$all_predictors())`
			`#`
			`#`
			`# rf_reg_workflow <- wf$workflow() %>%`
			`# wf$add_model(rf_base_reg_model) %>%`
			`# wf$add_recipe(rf_reg_recipe)`
			`#`
			`# rf_base_reg_fit <- p$fit(rf_reg_workflow, reg_train)`
			`#`
			`# rf_reg_predict <- reg_train %>%`
			`# dplyr::select(FT4) %>%`
			`# dplyr::bind_cols(`
			`# predict(rf_base_reg_fit, reg_train)`
			`# )`
			`#`
			`# reg_metrics(rf_reg_predict, truth = FT4, estimate = .pred)`
			`#`
			`# rf_reg_tune_model <- p$rand_forest(trees = tune(), mtry = tune(), min_n = tune()) %>%`
			`# p$set_engine("ranger") %>% p$set_mode("regression")`
			`#`
			`# rf_reg_pred <- dplyr::select(reg_train, -FT4, -subject_id, -charttime)`
			`#`
			`# rf_reg_param <- p$extract_parameter_set_dials(rf_reg_tune_model) %>%`
			`# update(mtry = d$finalize(d$mtry(), rf_reg_pred))`
			`#`
			`# data_fold_reg <- rsamp$vfold_cv(reg_train, v = 5)`
			`#`
			`# rf_reg_workflow <- wf$update_model(rf_reg_workflow, rf_reg_tune_model)`
			`#`
			`# # takes around 1 hr to run grid search. saving best params manaually`
			`# # rf_reg_tune <- rf_reg_workflow %>%`
			`# # tune::tune_grid(`
			`# # data_fold_reg`
			`# # ,grid = rf_reg_param %>% d$grid_regular()`
			`# # )`
			`#`
			`# rf_reg_best_params <- tibble::tibble(`
			`# mtry = 8`
			`# ,trees = 1000`
			`# ,min_n = 2`
			`# )`
			`#`
			`# final_rf_reg_workflow <- rf_reg_workflow %>%`
			`# tune::finalize_workflow(rf_reg_best_params)`
			`#`
			`# final_rf_reg_fit <- p$fit(final_rf_reg_workflow, reg_train)`
			`#`
			`# final_rf_reg_predict <- reg_train %>%`
			`# dplyr::select(FT4) %>%`
			`# dplyr::bind_cols(`
			`# predict(final_rf_reg_fit, reg_train)`
			`# )`
			`#`
			`# reg_metrics(final_rf_reg_predict, truth = FT4, estimate = .pred)`