From 66f213bc5a4fc592baf821429ca0f23a057e6fae Mon Sep 17 00:00:00 2001 From: Kyle Belanger Date: Sat, 21 Jan 2023 07:44:35 -0500 Subject: [PATCH] Update 1-data-exploration.R --- ML/1-data-exploration.R | 112 +++++++++++----------------------------- 1 file changed, 31 insertions(+), 81 deletions(-) diff --git a/ML/1-data-exploration.R b/ML/1-data-exploration.R index a3412f0..ac3742c 100644 --- a/ML/1-data-exploration.R +++ b/ML/1-data-exploration.R @@ -17,36 +17,11 @@ box::use( # globals ----------------------------------------------------------------- -test_list_names <- c( - "BUN" = "51006" - ,"CA" = "50893" - ,"CO2" = "50882" - ,"CL" = "50902" - ,"CREA" = "50912" - ,"GLU" = "50931" - ,"K" = "50971" - ,"NA" = "50983" - ,"TSH" = "50993" - ,"FT4" = "50995" - ,"RBC" = "51279" - ,"WBC" = "51300" - ,"HCT" = "51221" - ,"HGB" = "51222" - ,"PLT" = "51265" -) - # load data --------------------------------------------------------------- -ds_high_tsh_raw <- readr$read_rds( - here("ML","data-unshared","ds_high_tsh.RDS") - ) - -ds_low_tsh_raw <- readr$read_rds( - here("ML","data-unshared","ds_low_tsh.RDS") - ) - +ds0 <- readr$read_rds(here("ML","data-unshared","ds_final.RDS")) # data manipulation ------------------------------------------------------- @@ -54,69 +29,44 @@ ds_low_tsh_raw <- readr$read_rds( # using the FT4 Referance range low as the cut off (0.93) -ds_high_tsh <- ds_high_tsh_raw %>% - dplyr$mutate(ft4_dia = dplyr$if_else(`50995` < 0.93, TRUE, FALSE)) %>% - #can rename with a vector using either of these - # dplyr$rename_with(~names(test_list_names), dplyr$all_of(test_list_names)) - dplyr$rename(!!!test_list_names) %>% +ds1 <- ds0 %>% dplyr$select(-FT4, -subject_id, -charttime) %>% - dplyr$relocate(gender, anchor_age) - - -ds_low_tsh <- ds_low_tsh_raw %>% - dplyr$mutate(ft4_dia = dplyr$if_else(`50995` > 1.7, TRUE, FALSE)) %>% - #can rename with a vector using either of these - # dplyr$rename_with(~names(test_list_names), dplyr$all_of(test_list_names)) - dplyr$rename(!!!test_list_names) %>% - dplyr$select(-FT4, -subject_id, -charttime) %>% - dplyr$relocate(gender, anchor_age) - + dplyr$mutate(dplyr$across( + ft4_dia + , ~factor(., levels = c("Hypo", "Non-Hypo", "Normal TSH", "Hyper", "Non-Hyper") + ) + ) + ) # basic visualization ----------------------------------------------------- #summary Table -#use this instead of making myself -summary_table <- function(ds){ - table <- ds %>% - gtsummary$tbl_summary( - by = ft4_dia - ,missing = "no" - ,type = gtsummary$all_continuous() ~ "continuous2" - ,label = list( - gender ~ "Gender" - ,anchor_age ~ "Age" - ) - ,statistic = gtsummary$all_continuous() ~ c( - "{N_miss}" - ,"{median} ({p25}, {p75})" - ,"{min}, {max}" - ) - ) %>% - gtsummary$bold_labels() %>% - gtsummary$add_stat_label( - label = gtsummary$all_continuous() ~ c("Missing", "Median (IQR)", "Range") - ) %>% - gtsummary$modify_header(label = "**Variable**") %>% - gtsummary$modify_spanning_header(gtsummary$all_stat_cols() ~ "**Free T4 Diagnostic**") - return(table) - - } - -# create both tables -high_table_summary <- summary_table(ds_high_tsh) -low_table_summary <- summary_table(ds_low_tsh) - -# merge tables -merged_summary_table <- gtsummary$tbl_merge( - tbls = list(high_table_summary, low_table_summary) - ,tab_spanner = c( - "**Elevated TSH** \n Free T4 Diagnostic" - ,"**Decreased TSH** \n Free T4 Diagnostic" - ) +summary_tbl <- ds1 %>% + gtsummary$tbl_summary( + by = ft4_dia + ,missing = "no" + ,type = gtsummary$all_continuous() ~ "continuous2" + ,label = list( + gender ~ "Gender" + ,anchor_age ~ "Age" + ) + ,statistic = gtsummary$all_continuous() ~ c( + "{N_miss}" + ,"{median} ({p25}, {p75})" + ,"{min}, {max}" + ) + ) %>% + gtsummary$bold_labels() %>% + gtsummary$add_stat_label( + label = gtsummary$all_continuous() ~ c("Missing", "Median (IQR)", "Range") ) %>% - gtsummary$as_flex_table() + gtsummary$modify_header(label = "**Variable**") %>% + gtsummary$modify_spanning_header(gtsummary$all_stat_cols() ~ "**Free T4 Diagnostic**") + +# summary_tbl + # correlation plot