Update 1-data-exploration.R

This commit is contained in:
Kyle Belanger 2023-01-20 09:29:54 -05:00
parent eb770c9d77
commit 1cefc2c306

View file

@ -77,7 +77,8 @@ ds_low_tsh <- ds_low_tsh_raw %>%
#summary Table #summary Table
#use this instead of making myself #use this instead of making myself
high_table_summary <- ds_high_tsh %>% summary_table <- function(ds){
table <- ds %>%
gtsummary$tbl_summary( gtsummary$tbl_summary(
by = ft4_dia by = ft4_dia
,missing = "no" ,missing = "no"
@ -87,9 +88,9 @@ high_table_summary <- ds_high_tsh %>%
,anchor_age ~ "Age" ,anchor_age ~ "Age"
) )
,statistic = gtsummary$all_continuous() ~ c( ,statistic = gtsummary$all_continuous() ~ c(
"{N_miss}", "{N_miss}"
"{median} ({p25}, {p75})", ,"{median} ({p25}, {p75})"
"{min}, {max}" ,"{min}, {max}"
) )
) %>% ) %>%
gtsummary$bold_labels() %>% gtsummary$bold_labels() %>%
@ -99,12 +100,32 @@ high_table_summary <- ds_high_tsh %>%
gtsummary$modify_header(label = "**Variable**") %>% gtsummary$modify_header(label = "**Variable**") %>%
gtsummary$modify_spanning_header(gtsummary$all_stat_cols() ~ "**Free T4 Diagnostic**") gtsummary$modify_spanning_header(gtsummary$all_stat_cols() ~ "**Free T4 Diagnostic**")
high_table_summary return(table)
}
# create both tables
high_table_summary <- summary_table(ds_high_tsh)
low_table_summary <- summary_table(ds_low_tsh)
# merge tables
merged_summary_table <- gtsummary$tbl_merge(
tbls = list(high_table_summary, low_table_summary)
,tab_spanner = c(
"**Elevated TSH** \n Free T4 Diagnostic"
,"**Decreased TSH** \n Free T4 Diagnostic"
)
) %>%
gtsummary$as_flex_table()
# correlation plot # correlation plot
ds_corr <- cor(ds_high_tsh %>% dplyr$select(-subject_id, - charttime) ds_corr <- cor(ds_high_tsh %>%
%>% dplyr$mutate(dplyr$across(gender, ~dplyr$recode(.,M = 1, F = 2))) dplyr$mutate(dplyr$across(gender, ~dplyr$recode(.,M = 1, F = 2)))
,use = "complete.obs") ,use = "complete.obs")
@ -116,7 +137,6 @@ dev.off()
#quick recode of gender, will still do recoding during feature engineering #quick recode of gender, will still do recoding during feature engineering
g1 <- ds_high_tsh %>% g1 <- ds_high_tsh %>%
dplyr$select(-subject_id, - charttime) %>%
dplyr$mutate(dplyr$across(gender, ~dplyr$recode(.,M = 1, F = 2))) %>% dplyr$mutate(dplyr$across(gender, ~dplyr$recode(.,M = 1, F = 2))) %>%
tidyr$pivot_longer(cols = dplyr$everything()) %>% tidyr$pivot_longer(cols = dplyr$everything()) %>%
ggplot(aes(x = value)) + ggplot(aes(x = value)) +
@ -127,7 +147,7 @@ g1
# this takes a bit to load. No discernable paterns in the data # this takes a bit to load. No discernable paterns in the data
g2 <- ds_high_tsh %>% g2 <- ds_high_tsh %>%
dplyr$select(-gender, -subject_id, - charttime) %>% dplyr$select(-gender) %>%
tidyr$pivot_longer(cols = !ft4_dia) %>% tidyr$pivot_longer(cols = !ft4_dia) %>%
ggplot(aes(x = factor(ft4_dia), y = value, fill = factor(ft4_dia))) + ggplot(aes(x = factor(ft4_dia), y = value, fill = factor(ft4_dia))) +
gp2$geom_boxplot(outlier.shape = NA, na.rm = TRUE) + gp2$geom_boxplot(outlier.shape = NA, na.rm = TRUE) +