DHSC-Capstone/ML/1-data-exploration.R

114 lines
2.7 KiB
R
Raw Normal View History

2023-01-11 12:04:40 -05:00
2023-01-09 09:07:17 -05:00
# Reset the workspace before the script runs.
# NOTE(review): rm() only removes the objects that ls() reports here; attached
# packages, options() and the working directory are left as they were, so this
# is not a full session reset.
rm(list = ls(all.names = TRUE)) # Remove every object ls() returns, including dot-prefixed names.
cat("\014") # Write a form-feed character; presumably clears the console in RStudio -- TODO confirm for other front ends.
# load packages -----------------------------------------------------------
box::use(
magrittr[`%>%`]
,here[here]
,dplyr
,readr
,tidyr
2023-01-11 12:48:02 -05:00
,gp2 = ggplot2[ggplot, aes]
2023-01-09 09:07:17 -05:00
)
2023-01-09 09:37:37 -05:00
# globals -----------------------------------------------------------------
2023-01-11 12:04:40 -05:00
# Named character vector used to rename columns in bulk.
#   names  = short label each column is renamed to
#   values = column name as it appears in the raw data
# `NA` is backticked because a bare NA is a reserved word and cannot be used
# as an argument name; the resulting name is the ordinary string "NA".
test_list_names <- c(
  BUN  = "51006",
  CA   = "50893",
  CO2  = "50882",
  CL   = "50902",
  CREA = "50912",
  GLU  = "50931",
  K    = "50971",
  `NA` = "50983",
  TSH  = "50993",
  FT4  = "50995",
  RBC  = "51279",
  WBC  = "51300",
  HCT  = "51221",
  HGB  = "51222",
  PLT  = "51265"
)
2023-01-09 09:37:37 -05:00
2023-01-09 09:07:17 -05:00
# load data ---------------------------------------------------------------
2023-01-11 12:04:40 -05:00
# Read the serialized data set; here() builds the path
# ML/data-unshared/ds_high_tsh.RDS relative to the project root.
ds_high_tsh_raw <- here("ML", "data-unshared", "ds_high_tsh.RDS") %>%
  readr$read_rds()
# data manipulation -------------------------------------------------------
# Add a column flagging whether the free T4 (FT4) value is diagnostic,
# using the low end of the FT4 reference range (0.93) as the cutoff.
2023-01-11 12:04:40 -05:00
# Build the working data set from the raw import:
#   1. ft4_dia is 1 when column `50995` is below 0.93, 0 when it is not,
#      and NA when the value is missing.
#   2. the raw columns are renamed to the short labels in test_list_names.
ds_high_tsh <- ds_high_tsh_raw %>%
  # as.numeric() on the logical comparison yields 1 / 0 / NA as doubles
  dplyr$mutate(ft4_dia = as.numeric(`50995` < 0.93)) %>%
  # test_list_names is laid out as new_name = "old_name", so splicing it with
  # !!! renames every listed column at once; an equivalent spelling is
  #   dplyr$rename_with(~names(test_list_names), dplyr$all_of(test_list_names))
  dplyr$rename(!!!test_list_names)
2023-01-09 09:07:17 -05:00
# basic visualization -----------------------------------------------------
2023-01-11 12:48:02 -05:00
# Bar chart of the number of missing values in each column of ds_high_tsh.
#
# colSums(is.na(.)) returns a named numeric vector (one count per column).
# tibble::enframe() converts that directly into a two-column tibble, replacing
# the previous as_tibble(<atomic vector>, rownames = NA) followed by
# rownames_to_column() round-trip, which relied on the superseded
# "as_tibble() on a vector" code path. Column names are kept as
# `rowname` / `value` so the plot (including its axis titles) is unchanged.
g_count <- colSums(is.na(ds_high_tsh)) %>%
  tibble::enframe(name = "rowname", value = "value") %>%
  ggplot(aes(x = rowname, y = value)) +
  gp2$geom_col() +
  gp2$theme(
    # rotate the x labels so the column names are set vertically
    axis.text.x = gp2$element_text(angle = 90)
  )

g_count
2023-01-09 09:37:37 -05:00
2023-01-12 15:30:08 -05:00
# Correlation matrix of all columns except subject_id and charttime.
# gender is recoded to numbers first (M -> 1, F -> 2) because cor() needs
# numeric input; use = "complete.obs" drops every row containing an NA.
ds_corr <- ds_high_tsh %>%
  dplyr$select(!c(subject_id, charttime)) %>%
  dplyr$mutate(gender = dplyr$recode(gender, M = 1, F = 2)) %>%
  cor(use = "complete.obs")
2023-01-13 07:47:50 -05:00
# Save the correlation plot of ds_corr to figures/corrplot_high.png.
#
# corrplot() draws on the active graphics device, so the png device is opened
# first. The drawing call is wrapped in tryCatch(finally = ) so the device is
# closed even when corrplot() fails; without that, an error would leave the
# png device open and every later plot in the session would be written into
# this file instead of being displayed.
png(here("figures", "corrplot_high.png"), type = "cairo")
invisible(tryCatch(
  corrplot::corrplot(ds_corr, method = "number"),
  finally = dev.off()
))
2023-01-09 09:37:37 -05:00
2023-01-11 12:48:02 -05:00
#quick recode of gender, will still do recoding during feature engineering
2023-01-09 09:07:17 -05:00
# Histogram of every column except subject_id and charttime, one facet per
# column with independent axis scales. gender is recoded to numbers
# (M -> 1, F -> 2) so that it can be stacked into the single numeric `value`
# column produced by pivot_longer().
g1 <- ds_high_tsh %>%
  dplyr$select(!c(subject_id, charttime)) %>%
  dplyr$mutate(gender = dplyr$recode(gender, M = 1, F = 2)) %>%
  tidyr$pivot_longer(cols = dplyr$everything()) %>%
  gp2$ggplot(gp2$aes(x = value)) +
  gp2$geom_histogram() +
  gp2$facet_wrap(~name, scales = "free")

g1
2023-01-11 12:58:06 -05:00
# Box plots of each remaining column split by ft4_dia, one facet per column
# with independent axis scales. gender, subject_id and charttime are dropped;
# every other column except ft4_dia is stacked into name / value pairs.
g2 <- ds_high_tsh %>%
  dplyr$select(!c(gender, subject_id, charttime)) %>%
  tidyr$pivot_longer(cols = -ft4_dia) %>%
  gp2$ggplot(
    gp2$aes(x = factor(ft4_dia), y = value, fill = factor(ft4_dia))
  ) +
  # outliers are hidden on the box plot because the jitter layer below
  # already draws every observation
  gp2$geom_boxplot(outlier.shape = NA, na.rm = TRUE) +
  gp2$geom_jitter(size = 0.7, width = 0.1, alpha = 0.5, na.rm = TRUE) +
  gp2$facet_wrap(~name, scales = "free")

g2