diff --git a/ML/0-data_prep.R b/ML/0-data_prep.R index f3d09b0..1388461 100644 --- a/ML/0-data_prep.R +++ b/ML/0-data_prep.R @@ -23,7 +23,7 @@ db <- dbConnect( #item list shows two different numbers for a few tests, second set of items do not have # any results that are on the same samples as TSH and Free T4 -test_list <- c( +test_list_cmp <- c( 50862 #Albumin ,50863 #Alkaline Phosphatase ,50861 #Alanine Aminotransferase (ALT) @@ -42,39 +42,43 @@ test_list <- c( ,50995 #Thyroxine (T4), Free ) +test_list_bmp <- c( + 51006 #Urea Nitrogen + ,50893 #Calcium, Total + ,50882 #Bicarbonate + ,50902 #Chloride + ,50912 #Creatinine + ,50931 #Glucose + ,50971 #Potassium + ,50983 #Sodium + ,50993 #Thyroid Stimulating Hormone + ,50995 #Thyroxine (T4), Free +) +# TSH Ref Range from File 0.27 - 4.2 uIU/mL +# Free T4 Ref Range from File 0.93 - 1.7 ng/dL # load data --------------------------------------------------------------- -ds <- dplyr$tbl(db, "labevents") %>% - dplyr$filter(itemid %in% test_list) %>% - dplyr$select(-charttime,-storetime) %>% - tidyr$pivot_wider( - id_cols = c(subject_id,specimen_id) - ,names_from = itemid - ,values_from = valuenum - ) %>% - dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% - dplyr$collect() +# most likely will not use this as there are not as many complete rows. However +# gathering it just in case. +# first is using specimen id, usable data set is using chart time as it appears +# LIS uses different id's for groups of tests +# +# ds_cmp <- dplyr$tbl(db, "labevents") %>% +# dplyr$filter(itemid %in% test_list_cmp) %>% +# dplyr$select(-charttime,-storetime) %>% +# tidyr$pivot_wider( +# id_cols = c(subject_id,specimen_id) +# ,names_from = itemid +# ,values_from = valuenum +# ) %>% +# dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% +# dplyr$filter(dplyr$across(where(is.numeric), ~!is.na(.x))) %>% +# dplyr$collect() - -ds %>% dplyr$filter(dplyr$across(where(is.numeric), ~!is.na(.x))) - -count <- data.frame(colSums(is.na(ds))) %>% tibble::rownames_to_column() - - -testds <- readr::read_csv( - here("ML","data-unshared", "labevents.csv") - ,col_types = "_d_ddTT_d______" - ,n_max = 100 -) - - -#using chart time instead of spceimen id results in less NA values. -# total protien still have very low resulting - -ds1 <- dplyr$tbl(db, "labevents") %>% - dplyr$filter(itemid %in% test_list) %>% +ds_cmp <- dplyr$tbl(db, "labevents") %>% + dplyr$filter(itemid %in% test_list_cmp) %>% dplyr$select(-storetime) %>% tidyr$pivot_wider( id_cols = c(subject_id,charttime) @@ -84,13 +88,24 @@ ds1 <- dplyr$tbl(db, "labevents") %>% dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% dplyr$collect() -count2 <- data.frame(colSums(is.na(ds1))) %>% tibble::rownames_to_column() +#this keeps failing if run as part of the above query. Moving here to keep going +# keeps only rows that have values for all columns +ds_cmp <- ds_cmp %>% dplyr$filter(dplyr$if_all(.fns = ~!is.na(.))) -counts <- count %>% - dplyr$left_join(count2) -# using charttime, total of 5,424 rows with all values filled in -ds1 %>% dplyr$filter(dplyr$across(where(is.numeric), ~!is.na(.x))) +ds_bmp <- dplyr$tbl(db, "labevents") %>% + dplyr$filter(itemid %in% test_list_bmp) %>% + dplyr$select(-storetime) %>% + tidyr$pivot_wider( + id_cols = c(subject_id,charttime) + ,names_from = itemid + ,values_from = valuenum + ) %>% + dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% + dplyr$collect() + +ds_bmp <- ds_bmp %>% dplyr$filter(dplyr$if_all(.fns = ~!is.na(.))) + # close database ----------------------------------------------------------