From cbaf895c214649a1e8676fc71c45fbea3f674d61 Mon Sep 17 00:00:00 2001 From: Kyle Belanger Date: Sun, 8 Jan 2023 08:35:25 -0500 Subject: [PATCH] Update 0-data_prep.R --- ML/0-data_prep.R | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/ML/0-data_prep.R b/ML/0-data_prep.R index 88a1796..1aaae3b 100644 --- a/ML/0-data_prep.R +++ b/ML/0-data_prep.R @@ -66,8 +66,6 @@ patients <- dplyr$tbl(db, "patients") %>% dplyr$select(-anchor_year, -anchor_year_group, -dod) %>% dplyr$collect() -# most likely will not use this as there are not as many complete rows. However -# gathering it just in case. # first is using specimen id, usable data set is using chart time as it appears # LIS uses different id's for groups of tests # @@ -95,39 +93,39 @@ ds_cmp <- dplyr$tbl(db, "labevents") %>% dplyr$collect() #this keeps failing if run as part of the above query. Moving here to keep going -# keeps only rows that have values for all columns +# keeps only rows that have no more than three NAs ds_cmp <- patients %>% dplyr$left_join(ds_cmp, by = c("subject_id" = "subject_id")) %>% - dplyr$filter(dplyr$if_all(.fns = ~!is.na(.))) + dplyr$filter(rowSums(is.na(.)) <= 3) - -ds_bmp <- dplyr$tbl(db, "labevents") %>% - dplyr$filter(itemid %in% test_list_bmp) %>% - dplyr$select(-storetime) %>% - tidyr$pivot_wider( - id_cols = c(subject_id,charttime) - ,names_from = itemid - ,values_from = valuenum - ) %>% - dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% - dplyr$collect() - -ds_bmp <- patients %>% - dplyr$left_join(ds_bmp, by = c("subject_id" = "subject_id")) %>% - dplyr$filter(dplyr$if_all(.fns = ~!is.na(.))) +# No longer using this, but saving in case +# ds_bmp <- dplyr$tbl(db, "labevents") %>% +# dplyr$filter(itemid %in% test_list_bmp) %>% +# dplyr$select(-storetime) %>% +# tidyr$pivot_wider( +# id_cols = c(subject_id,charttime) +# ,names_from = itemid +# ,values_from = valuenum +# ) %>% +# dplyr$filter(!is.na(`50993`) & 
!is.na(`50995`)) %>% +# dplyr$collect() +# +# ds_bmp <- patients %>% +# dplyr$left_join(ds_bmp, by = c("subject_id" = "subject_id")) %>% +# dplyr$filter(dplyr$if_all(.fns = ~!is.na(.))) # save data --------------------------------------------------------------- -ds_high_tsh <- ds_bmp %>% +ds_high_tsh <- ds_cmp %>% dplyr$filter(`50993` > 4.2) %>% readr$write_rds( here("ML","data-unshared","ds_high_tsh.RDS") ) -ds_low_tsh <- ds_bmp %>% +ds_low_tsh <- ds_cmp %>% dplyr$filter(`50993` < 0.27) %>% readr$write_rds( here("ML","data-unshared","ds_low_tsh.RDS")