# Start from a clean slate: remove every object in the current environment,
# including hidden (dot-prefixed) ones, so nothing from a previous run leaks
# into this one. Note that this does not unload packages or reset options().
rm(list = ls(all.names = TRUE))
# Emit a form-feed character to clear the console.
cat("\014")


# load packages -----------------------------------------------------------

box::use(
  magrittr[`%>%`]
  ,here[here]
  ,dplyr
  ,readr
  ,tidyr
  ,gp2 = ggplot2[ggplot, aes]
)


# globals -----------------------------------------------------------------

# Lookup used to rename the lab-test columns: each element's *name* is the
# readable label and its *value* is the numeric item id that the raw datasets
# use as a column name.
test_list_names <- c(
  # chemistry
  BUN  = "51006",
  CA   = "50893",
  CO2  = "50882",
  CL   = "50902",
  CREA = "50912",
  GLU  = "50931",
  K    = "50971",
  "NA" = "50983", # must stay quoted: a bare NA is a reserved word
  # thyroid
  TSH  = "50993",
  FT4  = "50995",
  # hematology
  RBC  = "51279",
  WBC  = "51300",
  HCT  = "51221",
  HGB  = "51222",
  PLT  = "51265"
)


# load data ---------------------------------------------------------------

# Read the high-TSH and low-TSH datasets from the project's
# "ML/data-unshared" folder (paths are resolved relative to the project root
# by here()).
ds_high_tsh_raw <- here("ML", "data-unshared", "ds_high_tsh.RDS") %>%
  readr$read_rds()

ds_low_tsh_raw <- here("ML", "data-unshared", "ds_low_tsh.RDS") %>%
  readr$read_rds()


# data manipulation -------------------------------------------------------

# Swap the numeric item-id column names for the readable labels in
# test_list_names, add ft4_dia (a 1/0 flag marking whether the free T4 value
# is diagnostic), then drop the raw FT4 column.
#
# Alternative noted by the original author for renaming from the vector
# (not exercised here):
#   dplyr$rename_with(~names(test_list_names), dplyr$all_of(test_list_names))

# High TSH: flagged when FT4 is below 0.93, the low end of the FT4 reference
# range.
ds_high_tsh <- ds_high_tsh_raw %>%
  dplyr$rename(!!!test_list_names) %>%
  dplyr$mutate(ft4_dia = dplyr$if_else(FT4 < 0.93, 1, 0)) %>%
  dplyr$select(-FT4)


# Low TSH: flagged when FT4 is above 1.7
# (presumably the high end of the FT4 reference range -- TODO confirm).
ds_low_tsh <- ds_low_tsh_raw %>%
  dplyr$rename(!!!test_list_names) %>%
  dplyr$mutate(ft4_dia = dplyr$if_else(FT4 > 1.7, 1, 0)) %>%
  dplyr$select(-FT4)


# basic visualization -----------------------------------------------------

#graph and table of missing tests

#' Count missing values per column and plot the counts
#'
#' @param ds A data frame (or any object for which `is.na()` yields a logical
#'   matrix with column names).
#' @return A list with two elements:
#'   `df`, a tibble with one row per column of `ds` (`rowname` holds the
#'   column name, `value` the number of `NA`s in that column), and
#'   `graph`, a ggplot bar chart of `value` by `rowname`.
missing_count <- function(ds) {
  # enframe() is tibble's supported way to turn a named vector into a
  # two-column tibble; calling as_tibble() on an atomic vector is discouraged
  # and needed a separate rownames_to_column() step.
  df <- tibble::enframe(
    colSums(is.na(ds)),
    name = "rowname",
    value = "value"
  )

  graph <- ggplot(df, aes(x = rowname, y = value)) +
    gp2$geom_col() +
    # Rotate the x-axis labels so the column names do not overlap.
    gp2$theme(axis.text.x = gp2$element_text(angle = 90))

  list(df = df, graph = graph)
}

# Per-column missing-value counts (table + bar chart) for each dataset.
high_missing <- missing_count(ds = ds_high_tsh)
low_missing <- missing_count(ds = ds_low_tsh)

# Side-by-side counts: with the default join suffixes, `value.x` comes from
# the high-TSH data and `value.y` from the low-TSH data.
missing_table <- dplyr$left_join(
  x = high_missing$df,
  y = low_missing$df,
  by = "rowname"
)

# gtsummary builds the per-group summary table directly, so there is no need
# to hand-roll one.
gtsummary::tbl_summary(ds_high_tsh, by = ft4_dia)


# Number of rows in each ft4_dia group and the share of the total each group
# represents, rendered as a knitr table.
t1 <- knitr::kable(
  dplyr$mutate(
    dplyr$count(ds_high_tsh, ft4_dia),
    freq = n / sum(n)
  )
)

t1

# correlation plot
# Correlation matrix of the high-TSH data. The identifier and timestamp
# columns are dropped and gender is recoded to numbers (M = 1, F = 2) because
# cor() needs numeric input. use = "complete.obs" discards every row that has
# a missing value in any column before the correlations are computed.
ds_corr <- ds_high_tsh %>%
  dplyr$select(-subject_id, -charttime) %>%
  dplyr$mutate(dplyr$across(gender, ~dplyr$recode(., M = 1, F = 2))) %>%
  cor(use = "complete.obs")


# Save the correlation plot to figures/corrplot_high.png.
# tryCatch(finally = ) guarantees the png device is closed even when
# corrplot() fails; otherwise the device would stay open and later plots
# would be drawn into it.
png(here("figures", "corrplot_high.png"), type = "cairo")
tryCatch(
  corrplot::corrplot(ds_corr, method = "number"),
  finally = dev.off()
)


# Histograms of every remaining variable, one facet per variable.
# Gender gets a quick numeric recode (M = 1, F = 2) so it can be stacked with
# the other columns; the proper recoding still happens during feature
# engineering.
g1 <- ggplot(
  data = ds_high_tsh %>%
    dplyr$select(-subject_id, -charttime) %>%
    dplyr$mutate(gender = dplyr$recode(gender, M = 1, F = 2)) %>%
    tidyr$pivot_longer(cols = dplyr$everything()),
  mapping = aes(x = value)
) +
  gp2$geom_histogram(na.rm = TRUE) +
  gp2$facet_wrap(~name, scales = "free")
g1


# Box plots (with jittered points overlaid) of each variable, split by
# ft4_dia, one facet per variable.
# This is slow to render. No discernible patterns in the data.
g2 <- ggplot(
  data = ds_high_tsh %>%
    dplyr$select(-gender, -subject_id, -charttime) %>%
    tidyr$pivot_longer(cols = !ft4_dia),
  mapping = aes(x = factor(ft4_dia), y = value, fill = factor(ft4_dia))
) +
  # Outliers are hidden on the boxes because the jitter layer already draws
  # every point.
  gp2$geom_boxplot(outlier.shape = NA, na.rm = TRUE) +
  gp2$geom_jitter(size = .7, width = .1, alpha = .5, na.rm = TRUE) +
  gp2$facet_wrap(~name, scales = "free")
g2