# DHSC-Capstone/ML/1-data-exploration.R (109 lines, 2.7 KiB, R)

# Start from a clean slate so results do not depend on objects left over from
# an earlier run. `all.names = TRUE` makes `ls()` also list names that begin
# with a dot, so those are removed too.
rm(list = ls(all.names = TRUE)) # Clear the memory of variables from previous run.
# "\014" is the form-feed character; presumably the console in use (e.g.
# RStudio) treats it as a clear-screen request.
cat("\014") # Clear the console
# load packages -----------------------------------------------------------
box::use(
magrittr[`%>%`]
,here[here]
,dplyr
,readr
,tidyr
2023-01-11 12:48:02 -05:00
,gp2 = ggplot2[ggplot, aes]
2023-01-19 07:49:38 -05:00
,gtsummary
2023-01-09 09:07:17 -05:00
)
2023-01-09 09:37:37 -05:00
# globals -----------------------------------------------------------------
2023-01-09 09:07:17 -05:00
# load data ---------------------------------------------------------------
2023-01-21 07:44:35 -05:00
# Read the final dataset (an RDS file) from ML/data-unshared; `here()` builds
# the path so it does not depend on the current working directory.
ds0 <- here("ML", "data-unshared", "ds_final.RDS") %>%
  readr$read_rds()
2023-01-09 09:07:17 -05:00
# data manipulation -------------------------------------------------------
# Here I am adding a column to determine whether the Free T4 value is diagnostic
# or not, using the low end of the FT4 reference range as the cutoff (0.93).
2023-01-21 07:44:35 -05:00
# Build the exploration dataset: drop the FT4, subject_id and charttime
# columns, then store ft4_dia as a factor with an explicit level order.
# Any ft4_dia value not listed in `levels` becomes NA.
ds1 <- dplyr$select(ds0, -FT4, -subject_id, -charttime) %>%
  dplyr$mutate(
    ft4_dia = factor(
      ft4_dia,
      levels = c("Hypo", "Non-Hypo", "Normal TSH", "Hyper", "Non-Hyper")
    )
  )
2023-01-09 09:07:17 -05:00
2023-01-18 19:45:10 -05:00
2023-01-09 09:07:17 -05:00
# basic visualization -----------------------------------------------------
2023-01-19 07:49:38 -05:00
#summary Table
2023-01-21 07:44:35 -05:00
# Descriptive table of every column in ds1, one statistics column per ft4_dia
# level. Continuous variables are shown as multi-row ("continuous2") entries:
# number missing, median with 25th/75th percentiles, and min/max. The default
# "Unknown" row is switched off because the missing count is already reported.
summary_tbl <- ds1 %>%
  gtsummary$tbl_summary(
    by = ft4_dia,
    label = list(gender ~ "Gender", anchor_age ~ "Age"),
    type = list(gtsummary$all_continuous() ~ "continuous2"),
    statistic = list(
      gtsummary$all_continuous() ~ c(
        "{N_miss}",
        "{median} ({p25}, {p75})",
        "{min}, {max}"
      )
    ),
    missing = "no"
  ) %>%
  gtsummary$bold_labels() %>%
  # Human-readable names for the three statistic rows defined above.
  gtsummary$add_stat_label(
    label = list(
      gtsummary$all_continuous() ~ c("Missing", "Median (IQR)", "Range")
    )
  ) %>%
  gtsummary$modify_header(label = "**Variable**") %>%
  gtsummary$modify_spanning_header(
    gtsummary$all_stat_cols() ~ "**Free T4 Diagnostic**"
  )
# summary_tbl
2023-01-20 09:29:54 -05:00
2023-01-15 08:14:52 -05:00
# correlation plot
2023-01-20 09:29:54 -05:00
# Correlation matrix of the numeric columns of ds1 (cor() defaults to Pearson),
# computed on rows with no missing values, then saved as a PNG.
#
# This previously read from `ds_high_tsh`, which is never created in this
# script (and the workspace is cleared at the top), so the call stopped with
# "object 'ds_high_tsh' not found". NOTE(review): ds1 is assumed to be the
# intended replacement -- confirm.
ds_corr <- ds1 %>%
  # Map gender M/F to 1/2 so it can enter the correlation matrix.
  dplyr$mutate(dplyr$across(gender, ~ dplyr$recode(., M = 1, F = 2))) %>%
  # cor() only accepts numeric input, so drop the ft4_dia factor (and any other
  # non-numeric column). `where()` is resolved by tidyselect inside select().
  dplyr$select(where(is.numeric)) %>%
  cor(use = "complete.obs")

# code for saving corr plot
png(here("figures", "corrplot_high.png"), type = "cairo")
corrplot::corrplot(ds_corr, method = "number")
dev.off()
2023-01-11 12:48:02 -05:00
#quick recode of gender, will still do recoding during feature engineering
2023-01-09 09:07:17 -05:00
# Histogram of every numeric variable, one facet per variable with free axes.
#
# This previously read from `ds_high_tsh`, which is never created in this
# script, so it failed with "object not found". NOTE(review): ds1 is assumed
# to be the intended replacement -- confirm.
g1 <- ds1 %>%
  # Map gender M/F to 1/2 so it can be binned like the other variables.
  dplyr$mutate(dplyr$across(gender, ~ dplyr$recode(., M = 1, F = 2))) %>%
  # pivot_longer() stacks all selected columns into a single `value` column,
  # so the ft4_dia factor (and any other non-numeric column) must be dropped
  # first. `where()` is resolved by tidyselect inside select().
  dplyr$select(where(is.numeric)) %>%
  tidyr$pivot_longer(cols = dplyr$everything()) %>%
  ggplot(aes(x = value)) +
  gp2$geom_histogram(na.rm = TRUE) +
  gp2$facet_wrap(~name, scales = "free")
g1
2023-01-11 12:58:06 -05:00
2023-01-15 08:14:52 -05:00
# This takes a while to render. No discernible patterns in the data.
2023-01-11 12:58:06 -05:00
# Box plots (with jittered points) of each variable by ft4_dia group, one facet
# per variable with free axes.
#
# This previously read from `ds_high_tsh`, which is never created in this
# script, so it failed with "object not found". NOTE(review): ds1 is assumed
# to be the intended replacement, and every column other than gender and
# ft4_dia is assumed to be numeric -- confirm.
g2 <- ds1 %>%
  dplyr$select(-gender) %>%
  # Stack every column except the grouping variable into name/value pairs.
  tidyr$pivot_longer(cols = !ft4_dia) %>%
  ggplot(aes(x = factor(ft4_dia), y = value, fill = factor(ft4_dia))) +
  # Outliers are hidden on the box plot because the jitter layer already draws
  # every observation.
  gp2$geom_boxplot(outlier.shape = NA, na.rm = TRUE) +
  gp2$geom_jitter(size = .7, width = .1, alpha = .5, na.rm = TRUE) +
  gp2$facet_wrap(~name, scales = "free")
g2
2023-01-15 08:14:52 -05:00