From c331ab3f38296685e5b03ca0bd9edb8fd8bb6c27 Mon Sep 17 00:00:00 2001 From: Kyle Belanger Date: Thu, 19 Jan 2023 07:49:38 -0500 Subject: [PATCH] Update 1-data-exploration.R --- ML/1-data-exploration.R | 66 ++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/ML/1-data-exploration.R b/ML/1-data-exploration.R index 1113c97..243a945 100644 --- a/ML/1-data-exploration.R +++ b/ML/1-data-exploration.R @@ -11,6 +11,7 @@ box::use( ,readr ,tidyr ,gp2 = ggplot2[ggplot, aes] + ,gtsummary ) @@ -54,64 +55,47 @@ ds_low_tsh_raw <- readr$read_rds( ds_high_tsh <- ds_high_tsh_raw %>% - dplyr$mutate(ft4_dia = dplyr$if_else(`50995` < 0.93, 1, 0)) %>% + dplyr$mutate(ft4_dia = dplyr$if_else(`50995` < 0.93, TRUE, FALSE)) %>% #can rename with a vector using either of these # dplyr$rename_with(~names(test_list_names), dplyr$all_of(test_list_names)) dplyr$rename(!!!test_list_names) %>% - dplyr$select(-FT4) + dplyr$select(-FT4, -subject_id, -charttime) %>% + dplyr$relocate(gender, anchor_age) ds_low_tsh <- ds_low_tsh_raw %>% - dplyr$mutate(ft4_dia = dplyr$if_else(`50995` > 1.7, 1, 0)) %>% + dplyr$mutate(ft4_dia = dplyr$if_else(`50995` > 1.7, TRUE, FALSE)) %>% #can rename with a vector using either of these # dplyr$rename_with(~names(test_list_names), dplyr$all_of(test_list_names)) dplyr$rename(!!!test_list_names) %>% - dplyr$select(-FT4) + dplyr$select(-FT4, -subject_id, -charttime) %>% + dplyr$relocate(gender, anchor_age) # basic visualization ----------------------------------------------------- -#graph and table of missing tests - -missing_count <- function(ds){ - - df <- dplyr$as_tibble(colSums(is.na(ds)), rownames = NA ) %>% - tibble::rownames_to_column() - - graph <- df %>% - ggplot(aes(x = rowname, y = value)) + - gp2$geom_col() + - gp2$theme( - axis.text.x = gp2$element_text(angle = 90) - ) - - return( - list( - df = df - ,graph = graph - ) - ) - -} - -high_missing <- missing_count(ds_high_tsh) -low_missing <- missing_count(ds_low_tsh) - -missing_table <- high_missing$df %>% - dplyr$left_join(low_missing$df, by = "rowname") - +#summary Table #use this instead of making myself -ds_high_tsh %>% gtsummary::tbl_summary(by = ft4_dia) +high_table_summary <- ds_high_tsh %>% + gtsummary$tbl_summary( + by = ft4_dia + ,missing = "no" + ,label = list( + gender ~ "Gender" + ,anchor_age ~ "Age" + ) + ,statistic = list( + gtsummary$all_continuous() ~ "{median} ({min}-{max}) + /n Missing: {N_miss} " + ) + ) %>% + gtsummary$add_p() %>% + gtsummary$modify_header(label = "**Variable**") %>% + gtsummary$modify_spanning_header(gtsummary$all_stat_cols() ~ "**Free T4 Diagnostic**") +high_table_summary -# count of diagnostics ft4 and freq -t1 <- ds_high_tsh %>% - dplyr$count(ft4_dia) %>% - dplyr$mutate(freq = n/sum(n)) %>% - knitr::kable() - -t1 # correlation plot ds_corr <- cor(ds_high_tsh %>% dplyr$select(-subject_id, - charttime)