Update 0-data_prep.R

This commit is contained in:
Kyle Belanger 2023-01-09 09:07:14 -05:00
parent cbaf895c21
commit b19d1f25eb

View file

@ -43,6 +43,7 @@ test_list_cmp <- c(
,50995 #Thyroxine (T4), Free
)
# 51301 and 51300 looks like test name may have changed
test_list_bmp <- c(
51006 #Urea Nitrogen
,50893 #Calcium, Total
@ -54,7 +55,13 @@ test_list_bmp <- c(
,50983 #Sodium
,50993 #Thyroid Stimulating Hormone
,50995 #Thyroxine (T4), Free
)
,51279 #RBC
,51301 #White Blood Cells
,51300 #WBC Count
,51221 #Hematocrit
,51222 #Hemoglobin
,51265 #Platelet Count
)
# TSH Ref Range from File 0.27 - 4.2 uIU/mL
# Free T4 Ref Range from File 0.93 - 1.7 ng/dL
@ -80,28 +87,9 @@ patients <- dplyr$tbl(db, "patients") %>%
# dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
# dplyr$filter(dplyr$across(where(is.numeric), ~!is.na(.x))) %>%
# dplyr$collect()
ds_cmp <- dplyr$tbl(db, "labevents") %>%
dplyr$filter(itemid %in% test_list_cmp) %>%
dplyr$select(-storetime) %>%
tidyr$pivot_wider(
id_cols = c(subject_id,charttime)
,names_from = itemid
,values_from = valuenum
) %>%
dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
dplyr$collect()
#this keeps failing if run as part of the above query. Moving here to keep going
# keeps only rows that have no more then three NA's
ds_cmp <- patients %>%
dplyr$left_join(ds_cmp, by = c("subject_id" = "subject_id")) %>%
dplyr$filter(rowSums(is.na(.)) <= 3)
# No longer using this, but saving incase
# ds_bmp <- dplyr$tbl(db, "labevents") %>%
# dplyr$filter(itemid %in% test_list_bmp) %>%
#No longer using this, but saving incase
# ds_cmp <- dplyr$tbl(db, "labevents") %>%
# dplyr$filter(itemid %in% test_list_cmp) %>%
# dplyr$select(-storetime) %>%
# tidyr$pivot_wider(
# id_cols = c(subject_id,charttime)
@ -111,21 +99,44 @@ ds_cmp <- patients %>%
# dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
# dplyr$collect()
#
# ds_bmp <- patients %>%
# dplyr$left_join(ds_bmp, by = c("subject_id" = "subject_id")) %>%
# dplyr$filter(dplyr$if_all(.fns = ~!is.na(.)))
# #this keeps failing if run as part of the above query. Moving here to keep going
# # keeps only rows that have no more then three NA's
# ds_cmp <- patients %>%
# dplyr$left_join(ds_cmp, by = c("subject_id" = "subject_id")) %>%
# dplyr$filter(rowSums(is.na(.)) <= 3)
# BMP and CBC Results together
ds_bmp <- dplyr$tbl(db, "labevents") %>%
dplyr$filter(itemid %in% test_list_bmp) %>%
dplyr$select(-storetime) %>%
tidyr$pivot_wider(
id_cols = c(subject_id,charttime)
,names_from = itemid
,values_from = valuenum
) %>%
dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
dplyr$collect()
ds_bmp <- ds_bmp %>%
dplyr$left_join(patients, by = c("subject_id" = "subject_id")) %>%
dplyr$mutate(dplyr$across(`51300`, ~dplyr$if_else(!is.na(.),`51300`,`51301`))) %>%
dplyr$select(-`51301`) %>%
# dplyr$filter(dplyr$if_all(.fns = ~!is.na(.)))
dplyr$filter(rowSums(is.na(.)) <= 2) #allows for 2 missing test
# save data ---------------------------------------------------------------
ds_high_tsh <- ds_cmp %>%
ds_high_tsh <- ds_bmp %>%
dplyr$filter(`50993` > 4.2) %>%
readr$write_rds(
here("ML","data-unshared","ds_high_tsh.RDS")
)
ds_low_tsh <- ds_cmp %>%
ds_low_tsh <- ds_bmp %>%
dplyr$filter(`50993` < 0.27) %>%
readr$write_rds(
here("ML","data-unshared","ds_low_tsh.RDS")