diff --git a/ML/0-data_prep.R b/ML/0-data_prep.R index 1aaae3b..9b180a6 100644 --- a/ML/0-data_prep.R +++ b/ML/0-data_prep.R @@ -43,6 +43,7 @@ test_list_cmp <- c( ,50995 #Thyroxine (T4), Free ) +# 51301 and 51300 looks like test name may have changed test_list_bmp <- c( 51006 #Urea Nitrogen ,50893 #Calcium, Total @@ -54,7 +55,13 @@ test_list_bmp <- c( ,50983 #Sodium ,50993 #Thyroid Stimulating Hormone ,50995 #Thyroxine (T4), Free -) + ,51279 #RBC + ,51301 #White Blood Cells + ,51300 #WBC Count + ,51221 #Hematocrit + ,51222 #Hemoglobin + ,51265 #Platelet Count + ) # TSH Ref Range from File 0.27 - 4.2 uIU/mL # Free T4 Ref Range from File 0.93 - 1.7 ng/dL @@ -80,28 +87,9 @@ patients <- dplyr$tbl(db, "patients") %>% # dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% # dplyr$filter(dplyr$across(where(is.numeric), ~!is.na(.x))) %>% # dplyr$collect() - -ds_cmp <- dplyr$tbl(db, "labevents") %>% - dplyr$filter(itemid %in% test_list_cmp) %>% - dplyr$select(-storetime) %>% - tidyr$pivot_wider( - id_cols = c(subject_id,charttime) - ,names_from = itemid - ,values_from = valuenum - ) %>% - dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% - dplyr$collect() - -#this keeps failing if run as part of the above query. Moving here to keep going -# keeps only rows that have no more then three NA's -ds_cmp <- patients %>% - dplyr$left_join(ds_cmp, by = c("subject_id" = "subject_id")) %>% - dplyr$filter(rowSums(is.na(.)) <= 3) - - -# No longer using this, but saving incase -# ds_bmp <- dplyr$tbl(db, "labevents") %>% -# dplyr$filter(itemid %in% test_list_bmp) %>% +#No longer using this, but saving incase +# ds_cmp <- dplyr$tbl(db, "labevents") %>% +# dplyr$filter(itemid %in% test_list_cmp) %>% # dplyr$select(-storetime) %>% # tidyr$pivot_wider( # id_cols = c(subject_id,charttime) @@ -111,21 +99,44 @@ ds_cmp <- patients %>% # dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% # dplyr$collect() # -# ds_bmp <- patients %>% -# dplyr$left_join(ds_bmp, by = c("subject_id" = "subject_id")) %>% -# dplyr$filter(dplyr$if_all(.fns = ~!is.na(.))) +# #this keeps failing if run as part of the above query. Moving here to keep going +# # keeps only rows that have no more then three NA's +# ds_cmp <- patients %>% +# dplyr$left_join(ds_cmp, by = c("subject_id" = "subject_id")) %>% +# dplyr$filter(rowSums(is.na(.)) <= 3) + +# BMP and CBC Results together + +ds_bmp <- dplyr$tbl(db, "labevents") %>% + dplyr$filter(itemid %in% test_list_bmp) %>% + dplyr$select(-storetime) %>% + tidyr$pivot_wider( + id_cols = c(subject_id,charttime) + ,names_from = itemid + ,values_from = valuenum + ) %>% + dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% + dplyr$collect() + + +ds_bmp <- ds_bmp %>% + dplyr$left_join(patients, by = c("subject_id" = "subject_id")) %>% + dplyr$mutate(dplyr$across(`51300`, ~dplyr$if_else(!is.na(.),`51300`,`51301`))) %>% + dplyr$select(-`51301`) %>% + # dplyr$filter(dplyr$if_all(.fns = ~!is.na(.))) + dplyr$filter(rowSums(is.na(.)) <= 2) #allows for 2 missing test # save data --------------------------------------------------------------- -ds_high_tsh <- ds_cmp %>% +ds_high_tsh <- ds_bmp %>% dplyr$filter(`50993` > 4.2) %>% readr$write_rds( here("ML","data-unshared","ds_high_tsh.RDS") ) -ds_low_tsh <- ds_cmp %>% +ds_low_tsh <- ds_bmp %>% dplyr$filter(`50993` < 0.27) %>% readr$write_rds( here("ML","data-unshared","ds_low_tsh.RDS")