DHSC-Capstone/ML/0-data_prep.R

133 lines
3.5 KiB
R
Raw Normal View History

2023-01-05 08:18:18 -05:00
rm(list = ls(all.names = TRUE)) # Clear the memory of variables from previous run.
cat("\014") # Clear the console
# load packages -----------------------------------------------------------
box::use(
magrittr[`%>%`]
,RSQLite
,DBI[dbConnect,dbDisconnect]
,here[here]
2023-01-05 15:27:20 -05:00
,dplyr
2023-01-06 07:40:03 -05:00
,dbplyr
,tidyr
2023-01-05 08:18:18 -05:00
)
# globals -----------------------------------------------------------------
db <- dbConnect(
RSQLite$SQLite()
,here("ML","data-unshared","mimicDB.sqlite")
)
2023-01-06 08:46:52 -05:00
#item list shows two different numbers for a few tests, second set of items do not have
# any results that are on the same samples as TSH and Free T4
2023-01-06 14:48:53 -05:00
test_list_cmp <- c(
2023-01-05 15:27:20 -05:00
50862 #Albumin
,50863 #Alkaline Phosphatase
,50861 #Alanine Aminotransferase (ALT)
,50878 #Asparate Aminotransferase (AST)
,51006 #Urea Nitrogen
,50893 #Calcium, Total
,50882 #Bicarbonate
,50902 #Chloride
,50912 #Creatinine
,50931 #Glucose
,50971 #Potassium
,50983 #Sodium
,50885 #Bilirubin, Total
,50976 #Protein, Total
,50993 #Thyroid Stimulating Hormone
2023-01-06 08:46:52 -05:00
,50995 #Thyroxine (T4), Free
2023-01-05 15:27:20 -05:00
)
2023-01-05 08:18:18 -05:00
2023-01-06 14:48:53 -05:00
test_list_bmp <- c(
51006 #Urea Nitrogen
,50893 #Calcium, Total
,50882 #Bicarbonate
,50902 #Chloride
,50912 #Creatinine
,50931 #Glucose
,50971 #Potassium
,50983 #Sodium
,50993 #Thyroid Stimulating Hormone
,50995 #Thyroxine (T4), Free
)
2023-01-05 08:18:18 -05:00
2023-01-06 14:48:53 -05:00
# TSH Ref Range from File 0.27 - 4.2 uIU/mL
# Free T4 Ref Range from File 0.93 - 1.7 ng/dL
2023-01-05 08:18:18 -05:00
2023-01-05 15:27:20 -05:00
# load data ---------------------------------------------------------------
2023-01-05 08:18:18 -05:00
2023-01-06 15:48:47 -05:00
# load patients first to add to lab values
patients <- dplyr$tbl(db, "patients") %>%
dplyr$select(-anchor_year, -anchor_year_group, -dod) %>%
dplyr$collect()
#this function is failing if run as part of the DB query
# Recoding Male = 1, Female = 2
patients <- patients %>%
dplyr$mutate(dplyr$across(gender, ~dplyr$recode(gender, "M" = 1, "F" = 2)))
2023-01-06 14:48:53 -05:00
# most likely will not use this as there are not as many complete rows. However
# gathering it just in case.
# first is using specimen id, usable data set is using chart time as it appears
# LIS uses different id's for groups of tests
#
# ds_cmp <- dplyr$tbl(db, "labevents") %>%
# dplyr$filter(itemid %in% test_list_cmp) %>%
# dplyr$select(-charttime,-storetime) %>%
# tidyr$pivot_wider(
# id_cols = c(subject_id,specimen_id)
# ,names_from = itemid
# ,values_from = valuenum
# ) %>%
# dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
# dplyr$filter(dplyr$across(where(is.numeric), ~!is.na(.x))) %>%
# dplyr$collect()
ds_cmp <- dplyr$tbl(db, "labevents") %>%
dplyr$filter(itemid %in% test_list_cmp) %>%
dplyr$select(-storetime) %>%
2023-01-06 07:40:03 -05:00
tidyr$pivot_wider(
2023-01-06 14:48:53 -05:00
id_cols = c(subject_id,charttime)
2023-01-06 07:40:03 -05:00
,names_from = itemid
,values_from = valuenum
2023-01-06 14:48:53 -05:00
) %>%
2023-01-06 08:46:52 -05:00
dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
2023-01-06 07:40:03 -05:00
dplyr$collect()
2023-01-06 14:48:53 -05:00
#this keeps failing if run as part of the above query. Moving here to keep going
# keeps only rows that have values for all columns
2023-01-06 15:49:53 -05:00
ds_cmp <- patients %>%
dplyr$left_join(ds_cmp, by = c("subject_id" = "subject_id")) %>%
dplyr$filter(dplyr$if_all(.fns = ~!is.na(.)))
2023-01-06 08:46:52 -05:00
2023-01-06 14:48:53 -05:00
ds_bmp <- dplyr$tbl(db, "labevents") %>%
dplyr$filter(itemid %in% test_list_bmp) %>%
2023-01-06 08:46:52 -05:00
dplyr$select(-storetime) %>%
tidyr$pivot_wider(
id_cols = c(subject_id,charttime)
,names_from = itemid
,values_from = valuenum
) %>%
dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
dplyr$collect()
2023-01-06 15:49:53 -05:00
ds_bmp <- patients %>%
dplyr$left_join(ds_bmp, by = c("subject_id" = "subject_id")) %>%
dplyr$filter(dplyr$if_all(.fns = ~!is.na(.)))
2023-01-05 08:18:18 -05:00
2023-01-06 08:49:22 -05:00
2023-01-06 15:48:47 -05:00
2023-01-05 08:18:18 -05:00
# close database ----------------------------------------------------------
dbDisconnect(db)