Update 0-data_prep.R
create unified data set
This commit is contained in:
		
							parent
							
								
									9adf510b0f
								
							
						
					
					
						commit
						92c8e68f85
					
				
					 1 changed files with 35 additions and 64 deletions
				
			
		| 
						 | 
				
			
			@ -24,25 +24,6 @@ db <- dbConnect(
 | 
			
		|||
 | 
			
		||||
#item list shows two different numbers for a few tests, second set of items do not have
 | 
			
		||||
# any results that are on the same samples as TSH and Free T4
 | 
			
		||||
test_list_cmp <- c(
 | 
			
		||||
  50862 #Albumin
 | 
			
		||||
  ,50863	#Alkaline Phosphatase
 | 
			
		||||
  ,50861	#Alanine Aminotransferase (ALT)
 | 
			
		||||
  ,50878	#Asparate Aminotransferase (AST)
 | 
			
		||||
  ,51006	#Urea Nitrogen
 | 
			
		||||
  ,50893	#Calcium, Total
 | 
			
		||||
  ,50882	#Bicarbonate
 | 
			
		||||
  ,50902	#Chloride
 | 
			
		||||
  ,50912	#Creatinine
 | 
			
		||||
  ,50931	#Glucose
 | 
			
		||||
  ,50971	#Potassium
 | 
			
		||||
  ,50983	#Sodium
 | 
			
		||||
  ,50885	#Bilirubin, Total
 | 
			
		||||
  ,50976	#Protein, Total
 | 
			
		||||
  ,50993	#Thyroid Stimulating Hormone
 | 
			
		||||
  ,50995	#Thyroxine (T4), Free
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# 51301 and 51300 looks like test name may have changed
 | 
			
		||||
test_list_bmp <- c(
 | 
			
		||||
  51006	#Urea Nitrogen
 | 
			
		||||
| 
						 | 
				
			
			@ -63,6 +44,24 @@ test_list_bmp <- c(
 | 
			
		|||
  ,51265	#Platelet Count
 | 
			
		||||
  )
 | 
			
		||||
 | 
			
		||||
test_list_names <- c(
 | 
			
		||||
  "BUN"   = "51006"
 | 
			
		||||
  ,"CA"   = "50893"
 | 
			
		||||
  ,"CO2"  = "50882"
 | 
			
		||||
  ,"CL"   = "50902"
 | 
			
		||||
  ,"CREA" = "50912"
 | 
			
		||||
  ,"GLU"  = "50931"
 | 
			
		||||
  ,"K"    = "50971"
 | 
			
		||||
  ,"NA"   = "50983"
 | 
			
		||||
  ,"TSH"  = "50993"
 | 
			
		||||
  ,"FT4"  = "50995"
 | 
			
		||||
  ,"RBC"  = "51279"
 | 
			
		||||
  ,"WBC"  = "51300"
 | 
			
		||||
  ,"HCT"  = "51221"
 | 
			
		||||
  ,"HGB"  = "51222"
 | 
			
		||||
  ,"PLT"  = "51265"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# TSH Ref Range from File 0.27 - 4.2 uIU/mL
 | 
			
		||||
# Free T4 Ref Range from File 0.93 - 1.7 ng/dL
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -73,38 +72,8 @@ patients <- dplyr$tbl(db, "patients") %>%
 | 
			
		|||
  dplyr$select(-anchor_year, -anchor_year_group, -dod) %>%
 | 
			
		||||
  dplyr$collect()
 | 
			
		||||
 | 
			
		||||
# first is using specimen id, usable data set is using chart time as it appears
 | 
			
		||||
# usable data set is using chart time as it appears
 | 
			
		||||
# LIS uses different id's for groups of tests
 | 
			
		||||
#
 | 
			
		||||
# ds_cmp <- dplyr$tbl(db, "labevents") %>%
 | 
			
		||||
#   dplyr$filter(itemid %in% test_list_cmp) %>%
 | 
			
		||||
#   dplyr$select(-charttime,-storetime) %>%
 | 
			
		||||
#   tidyr$pivot_wider(
 | 
			
		||||
#     id_cols = c(subject_id,specimen_id)
 | 
			
		||||
#     ,names_from = itemid
 | 
			
		||||
#     ,values_from = valuenum
 | 
			
		||||
#     ) %>%
 | 
			
		||||
#   dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
 | 
			
		||||
#   dplyr$filter(dplyr$across(where(is.numeric), ~!is.na(.x))) %>%
 | 
			
		||||
#   dplyr$collect()
 | 
			
		||||
#No longer using this, but saving incase
 | 
			
		||||
# ds_cmp <- dplyr$tbl(db, "labevents") %>%
 | 
			
		||||
#   dplyr$filter(itemid %in% test_list_cmp) %>%
 | 
			
		||||
#   dplyr$select(-storetime) %>%
 | 
			
		||||
#   tidyr$pivot_wider(
 | 
			
		||||
#     id_cols = c(subject_id,charttime)
 | 
			
		||||
#     ,names_from = itemid
 | 
			
		||||
#     ,values_from = valuenum
 | 
			
		||||
#   ) %>%
 | 
			
		||||
#   dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
 | 
			
		||||
#   dplyr$collect()
 | 
			
		||||
#
 | 
			
		||||
# #this keeps failing if run as part of the above query.  Moving here to keep going
 | 
			
		||||
# # keeps only rows that have no more then three NA's
 | 
			
		||||
# ds_cmp <- patients %>%
 | 
			
		||||
#   dplyr$left_join(ds_cmp, by = c("subject_id" = "subject_id")) %>%
 | 
			
		||||
#   dplyr$filter(rowSums(is.na(.)) <= 3)
 | 
			
		||||
 | 
			
		||||
# BMP and CBC Results together
 | 
			
		||||
 | 
			
		||||
ds_bmp <- dplyr$tbl(db, "labevents") %>%
 | 
			
		||||
| 
						 | 
				
			
			@ -115,32 +84,34 @@ ds_bmp <- dplyr$tbl(db, "labevents") %>%
 | 
			
		|||
    ,names_from = itemid
 | 
			
		||||
    ,values_from = valuenum
 | 
			
		||||
  ) %>%
 | 
			
		||||
  dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
 | 
			
		||||
  dplyr$collect()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
ds_bmp <- ds_bmp %>%
 | 
			
		||||
ds1 <- ds_bmp %>%
 | 
			
		||||
  dplyr$filter(!is.na(`50993`) & !is.na(`50995`)) %>%
 | 
			
		||||
  dplyr$left_join(patients, by = c("subject_id" = "subject_id")) %>%
 | 
			
		||||
  dplyr$mutate(dplyr$across(`51300`, ~dplyr$if_else(!is.na(.),`51300`,`51301`))) %>%
 | 
			
		||||
  dplyr$select(-`51301`) %>%
 | 
			
		||||
  # dplyr$filter(dplyr$if_all(.fns = ~!is.na(.)))
 | 
			
		||||
  dplyr$filter(rowSums(is.na(.)) <= 2)  #allows for 2 missing test
 | 
			
		||||
  dplyr$filter(rowSums(is.na(.)) <= 3)  #allows for 3 missing test
 | 
			
		||||
 | 
			
		||||
ds_final <- ds1 %>%
 | 
			
		||||
  dplyr$mutate(
 | 
			
		||||
    ft4_dia = dplyr$case_when(
 | 
			
		||||
      `50993` > 4.2 & `50995` < 0.93 ~ "Hypo"
 | 
			
		||||
      ,`50993` > 4.2 & `50995` > 0.93 ~ "Non-Hypo"
 | 
			
		||||
      ,`50993` < 0.27 & `50995` > 1.7  ~ "Hyper"
 | 
			
		||||
      ,`50993` < 0.27 & `50995` < 1.7  ~ "Non-Hyper"
 | 
			
		||||
      ,TRUE ~ "Normal TSH"
 | 
			
		||||
    )
 | 
			
		||||
  ) %>%
 | 
			
		||||
  dplyr$rename(!!!test_list_names) %>%
 | 
			
		||||
  dplyr$relocate(gender, anchor_age)
 | 
			
		||||
 | 
			
		||||
# save data ---------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
ds_final %>% readr$write_rds(here("ML","data-unshared","ds_final.RDS"))
 | 
			
		||||
 | 
			
		||||
ds_high_tsh <- ds_bmp %>%
 | 
			
		||||
  dplyr$filter(`50993` > 4.2) %>%
 | 
			
		||||
  readr$write_rds(
 | 
			
		||||
    here("ML","data-unshared","ds_high_tsh.RDS")
 | 
			
		||||
  )
 | 
			
		||||
 | 
			
		||||
ds_low_tsh <- ds_bmp %>%
 | 
			
		||||
  dplyr$filter(`50993` < 0.27) %>%
 | 
			
		||||
  readr$write_rds(
 | 
			
		||||
    here("ML","data-unshared","ds_low_tsh.RDS")
 | 
			
		||||
  )
 | 
			
		||||
 | 
			
		||||
# close database ----------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue