DHSC-Capstone/ML/0-data_prep.R
2023-01-06 08:49:22 -05:00

97 lines
2.4 KiB
R

# Start each run from a clean slate: remove every object in the current
# environment, including hidden (dot-prefixed) names.
rm(list = ls(all.names = TRUE)) # Clear the memory of variables from previous run.
# "\014" is the form-feed character; writing it clears the console
# (presumably in RStudio -- other front ends may simply ignore it).
cat("\014") # Clear the console
# load packages -----------------------------------------------------------
box::use(
magrittr[`%>%`]
,RSQLite
,DBI[dbConnect,dbDisconnect]
,here[here]
,dplyr
,dbplyr
,tidyr
)
# globals -----------------------------------------------------------------
# Open the SQLite connection used by the queries in this script. The database
# file is located relative to the project root via `here()`.
db <- dbConnect(
  drv    = RSQLite$SQLite(),
  dbname = here("ML", "data-unshared", "mimicDB.sqlite")
)
# The item list shows two different item numbers for a few tests; the second set
# of items does not have any results on the same samples as TSH and Free T4.
# Item ids of the lab tests kept from the lab events table. The trailing
# comment on each line names the test the id stands for.
test_list <- c(
  50862, # Albumin
  50863, # Alkaline Phosphatase
  50861, # Alanine Aminotransferase (ALT)
  50878, # Aspartate Aminotransferase (AST)
  51006, # Urea Nitrogen
  50893, # Calcium, Total
  50882, # Bicarbonate
  50902, # Chloride
  50912, # Creatinine
  50931, # Glucose
  50971, # Potassium
  50983, # Sodium
  50885, # Bilirubin, Total
  50976, # Protein, Total
  50993, # Thyroid Stimulating Hormone
  50995  # Thyroxine (T4), Free
)
# load data ---------------------------------------------------------------
# Build a wide table keyed by subject and specimen, with one column per lab
# item id holding `valuenum`. Only rows that have both item 50993 (TSH) and
# item 50995 (Free T4) are kept. Every step before `collect()` is expressed
# against the database table; `collect()` pulls the result into memory.
ds <- dplyr$tbl(db, "labevents") %>%
  dplyr$filter(itemid %in% test_list) %>%
  dplyr$select(-c(charttime, storetime)) %>%
  tidyr$pivot_wider(
    id_cols     = c(subject_id, specimen_id),
    names_from  = itemid,
    values_from = valuenum
  ) %>%
  dplyr$filter(!is.na(`50993`), !is.na(`50995`)) %>%
  dplyr$collect()
# Show the rows of `ds` in which every numeric column is non-missing.
# `if_all()` is used because calling `across()` inside `filter()` is
# deprecated (dplyr >= 1.0.8); `if_all()` states the "all columns must pass"
# rule explicitly. The result is not assigned -- it is only for inspection.
ds %>%
  dplyr$filter(dplyr$if_all(where(is.numeric), ~ !is.na(.x)))
# Tally the missing values in each column of `ds`. The column names start out
# as row names and are moved into a regular `rowname` column so the tallies
# can be joined to another tally later in the script.
count <- tibble::rownames_to_column(
  data.frame(colSums(is.na(ds))),
  var = "rowname"
)
# Read the first 100 rows of the labevents CSV file. The compact `col_types`
# string has one character per column: `_` skips the column, `d` reads a
# double, and `T` reads a date-time.
testds <- readr::read_csv(
  file      = here("ML", "data-unshared", "labevents.csv"),
  col_types = "_d_ddTT_d______",
  n_max     = 100
)
# Using charttime instead of specimen_id results in fewer NA values.
# Total protein still has very few results.
# Same reshaping as for `ds`, but keyed by subject and chart time instead of
# specimen: one column per lab item id holding `valuenum`, restricted to rows
# that have both item 50993 (TSH) and item 50995 (Free T4). `collect()` pulls
# the result of the database query into memory.
ds1 <- dplyr$tbl(db, "labevents") %>%
  dplyr$filter(itemid %in% test_list) %>%
  dplyr$select(-storetime) %>%
  tidyr$pivot_wider(
    id_cols     = c(subject_id, charttime),
    names_from  = itemid,
    values_from = valuenum
  ) %>%
  dplyr$filter(!is.na(`50993`), !is.na(`50995`)) %>%
  dplyr$collect()
# Tally the missing values in each column of `ds1`, in the same shape as
# `count` (a `rowname` column plus one tally column).
count2 <- data.frame(colSums(is.na(ds1))) %>% tibble::rownames_to_column()

# Put the two tallies side by side. The join key is spelled out rather than
# left to dplyr's natural-join guess: this silences the "Joining with `by`"
# message and guarantees the match is on the column name only, even if the
# tally columns ever end up sharing a name.
counts <- count %>%
  dplyr$left_join(count2, by = "rowname")
# Using charttime, a total of 5,424 rows have all values filled in.
# Show the rows of `ds1` in which every numeric column is non-missing.
# `if_all()` is used because calling `across()` inside `filter()` is
# deprecated (dplyr >= 1.0.8). The result is not assigned -- it is only for
# inspection.
ds1 %>%
  dplyr$filter(dplyr$if_all(where(is.numeric), ~ !is.na(.x)))
# close database ----------------------------------------------------------
# Release the SQLite connection opened at the top of the script.
dbDisconnect(conn = db)