Update 0-data_prep.R
create unified data set
This commit is contained in:
parent
9adf510b0f
commit
92c8e68f85
1 changed files with 35 additions and 64 deletions
|
@ -24,25 +24,6 @@ db <- dbConnect(
|
|||
|
||||
#item list shows two different numbers for a few tests, second set of items do not have
|
||||
# any results that are on the same samples as TSH and Free T4
|
||||
test_list_cmp <- c(
|
||||
50862 #Albumin
|
||||
,50863 #Alkaline Phosphatase
|
||||
,50861 #Alanine Aminotransferase (ALT)
|
||||
,50878 #Asparate Aminotransferase (AST)
|
||||
,51006 #Urea Nitrogen
|
||||
,50893 #Calcium, Total
|
||||
,50882 #Bicarbonate
|
||||
,50902 #Chloride
|
||||
,50912 #Creatinine
|
||||
,50931 #Glucose
|
||||
,50971 #Potassium
|
||||
,50983 #Sodium
|
||||
,50885 #Bilirubin, Total
|
||||
,50976 #Protein, Total
|
||||
,50993 #Thyroid Stimulating Hormone
|
||||
,50995 #Thyroxine (T4), Free
|
||||
)
|
||||
|
||||
# 51301 and 51300 looks like test name may have changed
|
||||
test_list_bmp <- c(
|
||||
51006 #Urea Nitrogen
|
||||
|
@ -63,6 +44,24 @@ test_list_bmp <- c(
|
|||
,51265 #Platelet Count
|
||||
)
|
||||
|
||||
test_list_names <- c(
|
||||
"BUN" = "51006"
|
||||
,"CA" = "50893"
|
||||
,"CO2" = "50882"
|
||||
,"CL" = "50902"
|
||||
,"CREA" = "50912"
|
||||
,"GLU" = "50931"
|
||||
,"K" = "50971"
|
||||
,"NA" = "50983"
|
||||
,"TSH" = "50993"
|
||||
,"FT4" = "50995"
|
||||
,"RBC" = "51279"
|
||||
,"WBC" = "51300"
|
||||
,"HCT" = "51221"
|
||||
,"HGB" = "51222"
|
||||
,"PLT" = "51265"
|
||||
)
|
||||
|
||||
# TSH Ref Range from File 0.27 - 4.2 uIU/mL
|
||||
# Free T4 Ref Range from File 0.93 - 1.7 ng/dL
|
||||
|
||||
|
@ -73,38 +72,8 @@ patients <- dplyr$tbl(db, "patients") %>%
|
|||
dplyr$select(-anchor_year, -anchor_year_group, -dod) %>%
|
||||
dplyr$collect()
|
||||
|
||||
# first is using specimen id, usable data set is using chart time as it appears
|
||||
# usable data set is using chart time as it appears
|
||||
# LIS uses different id's for groups of tests
|
||||
#
|
||||
# ds_cmp <- dplyr$tbl(db, "labevents") %>%
|
||||
# dplyr$filter(itemid %in% test_list_cmp) %>%
|
||||
# dplyr$select(-charttime,-storetime) %>%
|
||||
# tidyr$pivot_wider(
|
||||
# id_cols = c(subject_id,specimen_id)
|
||||
# ,names_from = itemid
|
||||
# ,values_from = valuenum
|
||||
# ) %>%
|
||||
# dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
|
||||
# dplyr$filter(dplyr$across(where(is.numeric), ~!is.na(.x))) %>%
|
||||
# dplyr$collect()
|
||||
#No longer using this, but saving incase
|
||||
# ds_cmp <- dplyr$tbl(db, "labevents") %>%
|
||||
# dplyr$filter(itemid %in% test_list_cmp) %>%
|
||||
# dplyr$select(-storetime) %>%
|
||||
# tidyr$pivot_wider(
|
||||
# id_cols = c(subject_id,charttime)
|
||||
# ,names_from = itemid
|
||||
# ,values_from = valuenum
|
||||
# ) %>%
|
||||
# dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
|
||||
# dplyr$collect()
|
||||
#
|
||||
# #this keeps failing if run as part of the above query. Moving here to keep going
|
||||
# # keeps only rows that have no more then three NA's
|
||||
# ds_cmp <- patients %>%
|
||||
# dplyr$left_join(ds_cmp, by = c("subject_id" = "subject_id")) %>%
|
||||
# dplyr$filter(rowSums(is.na(.)) <= 3)
|
||||
|
||||
# BMP and CBC Results together
|
||||
|
||||
ds_bmp <- dplyr$tbl(db, "labevents") %>%
|
||||
|
@ -115,32 +84,34 @@ ds_bmp <- dplyr$tbl(db, "labevents") %>%
|
|||
,names_from = itemid
|
||||
,values_from = valuenum
|
||||
) %>%
|
||||
dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
|
||||
dplyr$collect()
|
||||
|
||||
|
||||
ds_bmp <- ds_bmp %>%
|
||||
ds1 <- ds_bmp %>%
|
||||
dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
|
||||
dplyr$left_join(patients, by = c("subject_id" = "subject_id")) %>%
|
||||
dplyr$mutate(dplyr$across(`51300`, ~dplyr$if_else(!is.na(.),`51300`,`51301`))) %>%
|
||||
dplyr$select(-`51301`) %>%
|
||||
# dplyr$filter(dplyr$if_all(.fns = ~!is.na(.)))
|
||||
dplyr$filter(rowSums(is.na(.)) <= 2) #allows for 2 missing test
|
||||
dplyr$filter(rowSums(is.na(.)) <= 3) #allows for 3 missing test
|
||||
|
||||
ds_final <- ds1 %>%
|
||||
dplyr$mutate(
|
||||
ft4_dia = dplyr$case_when(
|
||||
`50993` > 4.2 & `50995` < 0.93 ~ "Hypo"
|
||||
,`50993` > 4.2 & `50995` > 0.93 ~ "Non-Hypo"
|
||||
,`50993` < 0.27 & `50995` > 1.7 ~ "Hyper"
|
||||
,`50993` < 0.27 & `50995` < 1.7 ~ "Non-Hyper"
|
||||
,TRUE ~ "Normal TSH"
|
||||
)
|
||||
) %>%
|
||||
dplyr$rename(!!!test_list_names) %>%
|
||||
dplyr$relocate(gender, anchor_age)
|
||||
|
||||
# save data ---------------------------------------------------------------
|
||||
|
||||
ds_final %>% readr$write_rds(here("ML","data-unshared","ds_final.RDS"))
|
||||
|
||||
ds_high_tsh <- ds_bmp %>%
|
||||
dplyr$filter(`50993` > 4.2) %>%
|
||||
readr$write_rds(
|
||||
here("ML","data-unshared","ds_high_tsh.RDS")
|
||||
)
|
||||
|
||||
ds_low_tsh <- ds_bmp %>%
|
||||
dplyr$filter(`50993` < 0.27) %>%
|
||||
readr$write_rds(
|
||||
here("ML","data-unshared","ds_low_tsh.RDS")
|
||||
)
|
||||
|
||||
# close database ----------------------------------------------------------
|
||||
|
||||
|
|
Loading…
Reference in a new issue