From cbaf895c214649a1e8676fc71c45fbea3f674d61 Mon Sep 17 00:00:00 2001
From: Kyle Belanger <kyleb44@hotmail.com>
Date: Sun, 8 Jan 2023 08:35:25 -0500
Subject: [PATCH] Update 0-data_prep.R

---
 ML/0-data_prep.R | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/ML/0-data_prep.R b/ML/0-data_prep.R
index 88a1796..1aaae3b 100644
--- a/ML/0-data_prep.R
+++ b/ML/0-data_prep.R
@@ -66,8 +66,6 @@ patients <- dplyr$tbl(db, "patients") %>%
   dplyr$select(-anchor_year, -anchor_year_group, -dod) %>%
   dplyr$collect()
 
-# most likely will not use this as there are not as many complete rows.  However
-# gathering it just in case.
 # first is using specimen id, usable data set is using chart time as it appears
 # LIS uses different id's for groups of tests
 #
@@ -95,39 +93,39 @@ ds_cmp <- dplyr$tbl(db, "labevents") %>%
   dplyr$collect()
 
 #this keeps failing if run as part of the above query.  Moving here to keep going
-# keeps only rows that have values for all columns
+# keeps only rows that have no more than three NAs
 ds_cmp <- patients %>%
   dplyr$left_join(ds_cmp, by = c("subject_id" = "subject_id")) %>%
-  dplyr$filter(dplyr$if_all(.fns = ~!is.na(.)))
+  dplyr$filter(rowSums(is.na(.)) <= 3)
 
 
-
-ds_bmp <- dplyr$tbl(db, "labevents") %>%
-  dplyr$filter(itemid %in% test_list_bmp) %>%
-  dplyr$select(-storetime) %>%
-  tidyr$pivot_wider(
-    id_cols = c(subject_id,charttime)
-    ,names_from = itemid
-    ,values_from = valuenum
-  ) %>%
-  dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
-  dplyr$collect()
-
-ds_bmp <- patients %>%
-  dplyr$left_join(ds_bmp, by = c("subject_id" = "subject_id")) %>%
-  dplyr$filter(dplyr$if_all(.fns = ~!is.na(.)))
+# No longer using this, but saving it in case it is needed later
+# ds_bmp <- dplyr$tbl(db, "labevents") %>%
+#   dplyr$filter(itemid %in% test_list_bmp) %>%
+#   dplyr$select(-storetime) %>%
+#   tidyr$pivot_wider(
+#     id_cols = c(subject_id,charttime)
+#     ,names_from = itemid
+#     ,values_from = valuenum
+#   ) %>%
+#   dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
+#   dplyr$collect()
+#
+# ds_bmp <- patients %>%
+#   dplyr$left_join(ds_bmp, by = c("subject_id" = "subject_id")) %>%
+#   dplyr$filter(dplyr$if_all(.fns = ~!is.na(.)))
 
 
 # save data ---------------------------------------------------------------
 
 
-ds_high_tsh <- ds_bmp %>%
+ds_high_tsh <- ds_cmp %>%
   dplyr$filter(`50993` > 4.2) %>%
   readr$write_rds(
     here("ML","data-unshared","ds_high_tsh.RDS")
   )
 
-ds_low_tsh <- ds_bmp %>%
+ds_low_tsh <- ds_cmp %>%
   dplyr$filter(`50993` < 0.27) %>%
   readr$write_rds(
     here("ML","data-unshared","ds_low_tsh.RDS")