From c7aa0b2da110fab9991dda2213992e1cf4060a86 Mon Sep 17 00:00:00 2001 From: Kyle Belanger Date: Tue, 31 Jan 2023 19:33:24 -0500 Subject: [PATCH] update data --- ML/0-data_prep.R | 2 +- ML/1-data-exploration.R | 8 ++++---- ML/2-modeling.R | 26 ++++++++++++-------------- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/ML/0-data_prep.R b/ML/0-data_prep.R index 334a8e3..80967c9 100644 --- a/ML/0-data_prep.R +++ b/ML/0-data_prep.R @@ -107,7 +107,7 @@ ds_final <- ds1 %>% ) %>% dplyr$filter(ft4_dia != "Normal TSH") %>% dplyr$rename(!!!test_list_names) %>% - dplyr$select(-FT4) %>% + # dplyr$select(-FT4) %>% dplyr$relocate(gender, anchor_age) # save data --------------------------------------------------------------- diff --git a/ML/1-data-exploration.R b/ML/1-data-exploration.R index a2863ef..c139961 100644 --- a/ML/1-data-exploration.R +++ b/ML/1-data-exploration.R @@ -60,7 +60,7 @@ ds_recode <- ds1 %>% #summary Table summary_tbl <- ds1 %>% - dplyr$select(-subject_id, -charttime) %>% + dplyr$select(-subject_id, -charttime, -FT4) %>% gtsummary$tbl_summary( by = ft4_dia ,missing = "no" @@ -83,7 +83,7 @@ summary_tbl <- ds1 %>% # corr-plot --------------------------------------------------------------- corr_plot <- ds1 %>% - dplyr$select(-gender,-ft4_dia, -subject_id, -charttime) %>% + dplyr$select(-gender,-ft4_dia, -subject_id, -charttime, -FT4) %>% dplyr$rename(Age = anchor_age) %>% GGally$ggcorr(nbreaks = 5, palette = "Greys" ,label = TRUE, label_size = 3, label_color = "white" @@ -110,7 +110,7 @@ gp2$ggsave( #quick recode of gender, will still do recoding during feature engineering g1 <- ds1 %>% - dplyr$select(-gender,-ft4_dia, -subject_id, -charttime) %>% + dplyr$select(-gender,-ft4_dia, -subject_id, -charttime, -FT4) %>% tidyr$pivot_longer(cols = dplyr$everything()) %>% ggplot(aes(x = value)) + gp2$geom_histogram(na.rm = TRUE) + @@ -139,7 +139,7 @@ gp2$ggsave( # this takes a bit to load. No discernible patterns in the data g2 <- ds_recode %>% - dplyr$select(-gender, -subject_id, -charttime) %>% + dplyr$select(-gender, -subject_id, -charttime, -FT4) %>% dplyr$mutate(dplyr$across(-ft4_dia, log)) %>% tidyr$pivot_longer(cols = !ft4_dia) %>% ggplot(aes(x = factor(ft4_dia), y = value, fill = factor(ft4_dia))) + diff --git a/ML/2-modeling.R b/ML/2-modeling.R index 07b41cd..7159f27 100644 --- a/ML/2-modeling.R +++ b/ML/2-modeling.R @@ -51,12 +51,13 @@ table(ds_test$ft4_dia) %>% prop.table() rf_model <- p$rand_forest(trees = 1900) %>% - p$set_engine("ranger") %>% p$set_mode("classification") + p$set_engine("ranger") %>% p$set_mode("regression") -rf_recipe <- r$recipe(ft4_dia ~ . , data = ds_train) %>% +rf_recipe <- r$recipe(FT4 ~ . , data = ds_train) %>% r$update_role(subject_id, new_role = "id") %>% r$update_role(charttime, new_role = "time") %>% - r$step_impute_bag(r$all_numeric()) + r$update_role(ft4_dia, new_role = "class") %>% + r$step_impute_bag(r$all_predictors()) @@ -67,18 +68,15 @@ rf_workflow <- wf$workflow() %>% rf_fit <- p$fit(rf_workflow, ds_train) rf_predict <- ds_train %>% - dplyr::select(ft4_dia) %>% - dplyr::bind_cols( - predict(rf_fit, ds_train) - ,predict(rf_fit, ds_train, type = "prob") - ) + dplyr::select(FT4) %>% + dplyr::bind_cols(predict(rf_fit, ds_train)) -ys$accuracy(rf_predict, ft4_dia, .pred_class) -ys$conf_mat(rf_predict, ft4_dia, .pred_class) -ggplot2::autoplot(ys$roc_curve(rf_predict, ft4_dia, `.pred_Hypo`:`.pred_Non-Hyper`)) +gp2$ggplot(rf_predict, gp2$aes(x = FT4, y = .pred)) + + gp2$geom_point() -ys$sensitivity(rf_predict,ft4_dia, .pred_class, estimator = "macro_weighted") -ys$sensitivity(rf_predict,ft4_dia, .pred_class, estimator = "micro") -ys$roc_auc(rf_predict, ft4_dia, `.pred_Hypo`, `.pred_Non-Hypo`, `.pred_Hyper`, `.pred_Non-Hyper`) +ys$rmse(rf_predict, FT4, .pred) +metrics <- ys$metric_set(ys$rmse, ys$rsq, ys$mae) + +metrics(rf_predict, FT4, .pred)