Update 1-data-exploration.R
This commit is contained in:
parent
4b794c104d
commit
431e519ed1
1 changed file with 24 additions and 12 deletions
|
@ -56,12 +56,14 @@ ds_high_tsh <- ds_high_tsh_raw %>%
|
||||||
dplyr$mutate(ft4_dia = dplyr$if_else(`50995` < 0.93, 1, 0)) %>%
|
dplyr$mutate(ft4_dia = dplyr$if_else(`50995` < 0.93, 1, 0)) %>%
|
||||||
#can rename with a vector using either of these
|
#can rename with a vector using either of these
|
||||||
# dplyr$rename_with(~names(test_list_names), dplyr$all_of(test_list_names))
|
# dplyr$rename_with(~names(test_list_names), dplyr$all_of(test_list_names))
|
||||||
dplyr$rename(!!!test_list_names)
|
dplyr$rename(!!!test_list_names) %>%
|
||||||
|
dplyr$select(-FT4)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# basic visualization -----------------------------------------------------
|
# basic visualization -----------------------------------------------------
|
||||||
|
|
||||||
|
#graph of missing tests
|
||||||
g_count <- dplyr$as_tibble(colSums(is.na(ds_high_tsh)), rownames = NA ) %>%
|
g_count <- dplyr$as_tibble(colSums(is.na(ds_high_tsh)), rownames = NA ) %>%
|
||||||
tibble::rownames_to_column() %>%
|
tibble::rownames_to_column() %>%
|
||||||
ggplot(aes(x = rowname, y = value)) +
|
ggplot(aes(x = rowname, y = value)) +
|
||||||
|
@ -69,9 +71,21 @@ g_count <- dplyr$as_tibble(colSums(is.na(ds_high_tsh)), rownames = NA ) %>%
|
||||||
gp2$theme(
|
gp2$theme(
|
||||||
axis.text.x = gp2$element_text(angle = 90)
|
axis.text.x = gp2$element_text(angle = 90)
|
||||||
)
|
)
|
||||||
|
|
||||||
g_count
|
g_count
|
||||||
|
|
||||||
|
#table of missing tests
|
||||||
|
dplyr$as_tibble(colSums(is.na(ds_high_tsh)), rownames = NA ) %>%
|
||||||
|
tibble::rownames_to_column() %>% knitr::kable()
|
||||||
|
|
||||||
|
# count of diagnostics ft4 and freq
|
||||||
|
t1 <- ds_high_tsh %>%
|
||||||
|
dplyr$count(ft4_dia) %>%
|
||||||
|
dplyr$mutate(freq = n/sum(n)) %>%
|
||||||
|
knitr::kable()
|
||||||
|
|
||||||
|
t1
|
||||||
|
|
||||||
|
# correlation plot
|
||||||
ds_corr <- cor(ds_high_tsh %>% dplyr$select(-subject_id, - charttime)
|
ds_corr <- cor(ds_high_tsh %>% dplyr$select(-subject_id, - charttime)
|
||||||
%>% dplyr$mutate(dplyr$across(gender, ~dplyr$recode(.,M = 1, F = 2)))
|
%>% dplyr$mutate(dplyr$across(gender, ~dplyr$recode(.,M = 1, F = 2)))
|
||||||
,use = "complete.obs")
|
,use = "complete.obs")
|
||||||
|
@ -79,29 +93,22 @@ ds_corr <- cor(ds_high_tsh %>% dplyr$select(-subject_id, - charttime)
|
||||||
|
|
||||||
#code for saving corr plot
|
#code for saving corr plot
|
||||||
png(here("figures","corrplot_high.png"), type = 'cairo')
|
png(here("figures","corrplot_high.png"), type = 'cairo')
|
||||||
|
|
||||||
corrplot::corrplot(ds_corr, method = "number")
|
corrplot::corrplot(ds_corr, method = "number")
|
||||||
|
|
||||||
dev.off()
|
dev.off()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#quick recode of gender, will still do recoding during feature engineering
|
#quick recode of gender, will still do recoding during feature engineering
|
||||||
g1 <- ds_high_tsh %>%
|
g1 <- ds_high_tsh %>%
|
||||||
dplyr$select(-subject_id, - charttime) %>%
|
dplyr$select(-subject_id, - charttime) %>%
|
||||||
dplyr$mutate(dplyr$across(gender, ~dplyr$recode(.,M = 1, F = 2))) %>%
|
dplyr$mutate(dplyr$across(gender, ~dplyr$recode(.,M = 1, F = 2))) %>%
|
||||||
tidyr$pivot_longer(cols = dplyr$everything()) %>%
|
tidyr$pivot_longer(cols = dplyr$everything()) %>%
|
||||||
ggplot(aes(x = value)) +
|
ggplot(aes(x = value)) +
|
||||||
gp2$geom_histogram() +
|
gp2$geom_histogram(na.rm = TRUE) +
|
||||||
gp2$facet_wrap(~name, scales = "free")
|
gp2$facet_wrap(~name, scales = "free")
|
||||||
g1
|
g1
|
||||||
|
|
||||||
|
|
||||||
|
# this takes a bit to load. No discernible patterns in the data
|
||||||
|
|
||||||
g2 <- ds_high_tsh %>%
|
g2 <- ds_high_tsh %>%
|
||||||
dplyr$select(-gender, -subject_id, - charttime) %>%
|
dplyr$select(-gender, -subject_id, - charttime) %>%
|
||||||
tidyr$pivot_longer(cols = !ft4_dia) %>%
|
tidyr$pivot_longer(cols = !ft4_dia) %>%
|
||||||
|
@ -109,5 +116,10 @@ g2 <- ds_high_tsh %>%
|
||||||
gp2$geom_boxplot(outlier.shape = NA, na.rm = TRUE) +
|
gp2$geom_boxplot(outlier.shape = NA, na.rm = TRUE) +
|
||||||
gp2$geom_jitter(size=.7, width=.1, alpha=.5, na.rm = TRUE) +
|
gp2$geom_jitter(size=.7, width=.1, alpha=.5, na.rm = TRUE) +
|
||||||
gp2$facet_wrap(~name, scales = "free")
|
gp2$facet_wrap(~name, scales = "free")
|
||||||
|
|
||||||
g2
|
g2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue