rm(list = ls(all.names = TRUE)) # Clear the memory of variables from previous run.
cat("\014") # Clear the console


# load packages -----------------------------------------------------------

box::use(
  magrittr[`%>%`]
  ,here[here]
  ,dplyr
  ,readr
  ,tidyr
  ,gp2 = ggplot2[ggplot, aes]
  ,gtsummary
  ,GGally
)


# globals -----------------------------------------------------------------


# load data ---------------------------------------------------------------

ds0 <- readr$read_rds(here("ML","data-unshared","ds_final.RDS"))

# data manipulation -------------------------------------------------------

#here I am adding a column to determine if the Free T4 Value is diagnostic or not
# using the FT4 Referance range low as the cut off (0.93)


ds1 <- ds0 %>%
  dplyr$mutate(dplyr$across(
    ft4_dia
    , ~factor(., levels = c("Hypo", "Non-Hypo","Hyper", "Non-Hyper")
              )
    )
  )

ds_recode <- ds1 %>%
  dplyr$mutate(
    dplyr$across(
      gender
      ,~dplyr$recode(.,"M" = 1, "F" = 2)
    )
    ,dplyr$across(
      ft4_dia
      ,~dplyr$recode(.
                     ,"Hypo"       = 1
                     ,"Non-Hypo"   = 2
                     ,"Hyper"      = 3
                     ,"Non-Hyper"  = 4
      )
    )
  )


# basic visualization -----------------------------------------------------

#summary Table

summary_tbl <- ds1 %>%
  dplyr$select(-subject_id, -charttime) %>%
  gtsummary$tbl_summary(
    by = ft4_dia
    ,missing = "no"
    ,type = gtsummary$all_continuous() ~ "continuous"
    ,label = list(
      gender ~ "Gender"
      ,anchor_age ~ "Age"
    )
    ,statistic = gtsummary$all_continuous() ~ c("{median} ({p25}, {p75})")
  ) %>%
  # gtsummary$bold_labels() %>%
  gtsummary$add_n(statistic = "{p_miss}", col_label = "**% Missing**") %>%
  gtsummary$modify_header(label = "**Variable**") %>%
  gtsummary$modify_spanning_header(gtsummary$all_stat_cols() ~ "**Free T4 Outcome**")


# summary_tbl


# corr-plot ---------------------------------------------------------------

corr_plot <- ds1 %>%
  dplyr$select(-gender,-ft4_dia, -subject_id, -charttime) %>%
  dplyr$rename(Age = anchor_age) %>%
  GGally$ggcorr(nbreaks = 5, palette = "Greys"
                ,label = TRUE, label_size = 3, label_color = "white"
                  ,label_round = 2
                ,hjust = 0.75
                ,layout.exp = 1)

# corr_plot

gp2$ggsave(
  here("figures","corr_plot.emf")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
  ,device = devEMF::emf
)
gp2$ggsave(
  here("figures","corr_plot.png")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
)


#quick recode of gender, will still do recoding during feature engineering
g1 <- ds1 %>%
  dplyr$select(-gender,-ft4_dia, -subject_id, -charttime) %>%
  tidyr$pivot_longer(cols = dplyr$everything()) %>%
  ggplot(aes(x = value)) +
  gp2$geom_histogram(na.rm = TRUE) +
  gp2$facet_wrap(~name, scales = "free") +
  gp2$theme_bw() +
  gp2$labs(
    x = NULL
    ,y = NULL
  )

# g1

gp2$ggsave(
  here("figures","distrubution_histo.emf")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
  ,device = devEMF::emf
)
gp2$ggsave(
  here("figures","distrubution_histo.png")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
)

# this takes a bit to load.  No discernible patterns in the data
g2 <- ds_recode %>%
  dplyr$select(-gender, -subject_id, -charttime) %>%
  dplyr$mutate(dplyr$across(-ft4_dia, log)) %>%
  tidyr$pivot_longer(cols = !ft4_dia) %>%
  ggplot(aes(x = factor(ft4_dia), y = value, fill = factor(ft4_dia))) +
  gp2$stat_boxplot(geom = "errorbar", na.rm = TRUE) +
  gp2$geom_boxplot(na.rm = TRUE, outlier.shape = NA) +
  gp2$facet_wrap(~name, scales = "free") +
  gp2$theme_bw() +
  gp2$scale_fill_brewer(
    palette = "Greys"
    ,labels = c("1 - Hypo","2 - Non-Hypo","3 - Hyper","4 - Non-Hyper")
  ) +
  gp2$labs(
    x = NULL
    ,y = NULL
    ,fill = "Lab Diagnosis"
    ,caption = "Note. All values log transformed"
  ) +
  gp2$theme(
    plot.caption = gp2$element_text(hjust = 0)
  )

# g2
gp2$ggsave(
  here("figures","boxplot.emf")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
  ,device = devEMF::emf
  )
gp2$ggsave(
  here("figures","boxplot.png")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
)


# save-data ---------------------------------------------------------------

ds1 %>% readr$write_rds(here("ML","data-unshared","model_data.RDS"))