diff --git a/ML/0-data_prep.R b/ML/0-data_prep.R index 9b180a6..ad4a7d2 100644 --- a/ML/0-data_prep.R +++ b/ML/0-data_prep.R @@ -24,25 +24,6 @@ db <- dbConnect( #item list shows two different numbers for a few tests, second set of items do not have # any results that are on the same samples as TSH and Free T4 -test_list_cmp <- c( - 50862 #Albumin - ,50863 #Alkaline Phosphatase - ,50861 #Alanine Aminotransferase (ALT) - ,50878 #Asparate Aminotransferase (AST) - ,51006 #Urea Nitrogen - ,50893 #Calcium, Total - ,50882 #Bicarbonate - ,50902 #Chloride - ,50912 #Creatinine - ,50931 #Glucose - ,50971 #Potassium - ,50983 #Sodium - ,50885 #Bilirubin, Total - ,50976 #Protein, Total - ,50993 #Thyroid Stimulating Hormone - ,50995 #Thyroxine (T4), Free -) - # 51301 and 51300 looks like test name may have changed test_list_bmp <- c( 51006 #Urea Nitrogen @@ -63,6 +44,24 @@ test_list_bmp <- c( ,51265 #Platelet Count ) +test_list_names <- c( + "BUN" = "51006" + ,"CA" = "50893" + ,"CO2" = "50882" + ,"CL" = "50902" + ,"CREA" = "50912" + ,"GLU" = "50931" + ,"K" = "50971" + ,"NA" = "50983" + ,"TSH" = "50993" + ,"FT4" = "50995" + ,"RBC" = "51279" + ,"WBC" = "51300" + ,"HCT" = "51221" + ,"HGB" = "51222" + ,"PLT" = "51265" +) + # TSH Ref Range from File 0.27 - 4.2 uIU/mL # Free T4 Ref Range from File 0.93 - 1.7 ng/dL @@ -73,38 +72,8 @@ patients <- dplyr$tbl(db, "patients") %>% dplyr$select(-anchor_year, -anchor_year_group, -dod) %>% dplyr$collect() -# first is using specimen id, usable data set is using chart time as it appears +# usable data set is using chart time as it appears # LIS uses different id's for groups of tests -# -# ds_cmp <- dplyr$tbl(db, "labevents") %>% -# dplyr$filter(itemid %in% test_list_cmp) %>% -# dplyr$select(-charttime,-storetime) %>% -# tidyr$pivot_wider( -# id_cols = c(subject_id,specimen_id) -# ,names_from = itemid -# ,values_from = valuenum -# ) %>% -# dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% -# dplyr$filter(dplyr$across(where(is.numeric), ~!is.na(.x))) %>% -# dplyr$collect() -#No longer using this, but saving incase -# ds_cmp <- dplyr$tbl(db, "labevents") %>% -# dplyr$filter(itemid %in% test_list_cmp) %>% -# dplyr$select(-storetime) %>% -# tidyr$pivot_wider( -# id_cols = c(subject_id,charttime) -# ,names_from = itemid -# ,values_from = valuenum -# ) %>% -# dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% -# dplyr$collect() -# -# #this keeps failing if run as part of the above query. Moving here to keep going -# # keeps only rows that have no more then three NA's -# ds_cmp <- patients %>% -# dplyr$left_join(ds_cmp, by = c("subject_id" = "subject_id")) %>% -# dplyr$filter(rowSums(is.na(.)) <= 3) - # BMP and CBC Results together ds_bmp <- dplyr$tbl(db, "labevents") %>% @@ -115,32 +84,34 @@ ds_bmp <- dplyr$tbl(db, "labevents") %>% ,names_from = itemid ,values_from = valuenum ) %>% - dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% dplyr$collect() -ds_bmp <- ds_bmp %>% +ds1 <- ds_bmp %>% + dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>% dplyr$left_join(patients, by = c("subject_id" = "subject_id")) %>% dplyr$mutate(dplyr$across(`51300`, ~dplyr$if_else(!is.na(.),`51300`,`51301`))) %>% dplyr$select(-`51301`) %>% # dplyr$filter(dplyr$if_all(.fns = ~!is.na(.))) - dplyr$filter(rowSums(is.na(.)) <= 2) #allows for 2 missing test + dplyr$filter(rowSums(is.na(.)) <= 3) #allows for 3 missing test +ds_final <- ds1 %>% + dplyr$mutate( + ft4_dia = dplyr$case_when( + `50993` > 4.2 & `50995` < 0.93 ~ "Hypo" + ,`50993` > 4.2 & `50995` > 0.93 ~ "Non-Hypo" + ,`50993` < 0.27 & `50995` > 1.7 ~ "Hyper" + ,`50993` < 0.27 & `50995` < 1.7 ~ "Non-Hyper" + ,TRUE ~ "Normal TSH" + ) + ) %>% + dplyr$rename(!!!test_list_names) %>% + dplyr$relocate(gender, anchor_age) # save data --------------------------------------------------------------- +ds_final %>% readr$write_rds(here("ML","data-unshared","ds_final.RDS")) -ds_high_tsh <- ds_bmp %>% - dplyr$filter(`50993` > 4.2) %>% - readr$write_rds( - here("ML","data-unshared","ds_high_tsh.RDS") - ) - -ds_low_tsh <- ds_bmp %>% - dplyr$filter(`50993` < 0.27) %>% - readr$write_rds( - here("ML","data-unshared","ds_low_tsh.RDS") - ) # close database ----------------------------------------------------------