quarto-blog/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-1.R

#These first few lines run only when the file is run in RStudio, !!NOT when an Rmd/Rnw file calls it!!
rm(list=ls(all=TRUE))  #Clear the variables from previous runs.
cat("\f") # clear console

# ---- load-packages --------------------------------------------------
# Attach these packages so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path
library(magrittr) # enables piping : %>%
library(dplyr)    # data wrangling
library(ggplot2)  # graphs
library(tidyr)    # data tidying
library(maps)
library(mapdata)
library(sf)
library(readr)


# ---- load-sources ---------------------------------------------------


# ---- declare-globals ----------------------------------------------------

#set ggplot theme
ggplot2::theme_set(theme_bw())

# ---- load-data ------------------------------------------------------

# load the data, and have all column names in lowercase

nc_diabetes_data_raw <- read_csv("https://raw.githubusercontent.com/mmmmtoasty19/nc-diabetes-epidemic-2020/62bdaa6971fbdff09214de7c013d40122abbe40d/data-public/derived/nc-diabetes-data.csv") %>%
  rename_all(tolower)

us_diabetes_data_raw <- read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/62bdaa6971fbdff09214de7c013d40122abbe40d/data-public/raw/us_diabetes_totals.csv"
                                 ,skip = 2)

rural_counties <- read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/rural-counties.csv")

county_centers_raw <- read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/raw/nc_county_centers.csv", col_names = c("county", "lat","long"))

diabetes_atlas_data_raw <- read_csv("https://raw.githubusercontent.com/mmmmtoasty19/nc-diabetes-epidemic-2020/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/raw/DiabetesAtlasData.csv"
                                    ,col_types = cols(LowerLimit = col_skip(),
                                                     UpperLimit = col_skip(),
                                                     Percentage = col_double()), skip = 2)


# ---- load-map-data ----------------------------------------------------------

# load in both US State Map and NC County Map

nc_counties_map_raw <- st_as_sf(map("county",region = "north carolina", plot = FALSE,fill = TRUE)) %>%
  mutate_at("ID", ~stringr::str_remove(.,"north carolina,"))

state_map_raw <- st_as_sf(map("state",plot = FALSE,fill = TRUE ))

nc_cities <-  st_as_sf(read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/nc_cities.csv"),
                       coords = c("long", "lat")
                       ,remove = FALSE
                       ,agr = "constant"
                       ,crs = 4326)


# ---- tweak-data --------------------------------------------------------------

county_centers <- county_centers_raw %>%
  mutate_all(~stringr::str_replace_all(.,
                                       c("\\°"  = ""
                                         ,"\\+" = ""
                                         ,"\\–" = "-"
                                       )
  )
  ) %>%
  mutate(across(c("lat","long"), ~iconv(.,from = 'UTF-8', to = 'ASCII//TRANSLIT'))
         ,across(c("lat","long"),~stringr::str_remove_all(.,"\\?"))) %>%
  mutate_at(c("lat","long"),as.numeric) %>%
  mutate(across("long", ~(. * -1))) %>%
  mutate_at("county", tolower)


us_diabetes_data <- us_diabetes_data_raw %>%
  filter(Year >= 2000) %>%
  select( "Year","Total - Percentage") %>%
  rename(year = Year , us_pct = `Total - Percentage`)

diabetes_atlas_data <- diabetes_atlas_data_raw %>%
  mutate_at("State", tolower) %>%
  filter(Year >= 2000)

state_map_abb <- state_map_raw %>%
  left_join(read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/state-abb.csv") %>%
              mutate_at("state", tolower)
            ,by = c("ID" = "state") )


# ---- merge-data ---------------------------------------------------------

#join US totals to NC data

nc_diabetes_data <- nc_diabetes_data_raw %>%
  mutate_at("county", ~stringr::str_replace_all(.,"Mcdowell","McDowell")) %>%
  mutate(
    rural = county %in% rural_counties$rural_counties
  ) %>%
  mutate_at("county",tolower) %>%
  left_join(us_diabetes_data)


nc_counties_map <- nc_counties_map_raw %>%
  left_join(nc_diabetes_data, by = c("ID" = "county")) %>%
  left_join(county_centers, by = c("ID" = "county")) %>%
  rename(
    center_long = long
    ,center_lat = lat)

state_map <- state_map_abb %>%
  left_join(diabetes_atlas_data, by = c("ID" = "State")) %>%
  rename_all(tolower)


# ---- o-g1 ------------------------------------------------------------------


us_diabetes_data <- us_diabetes_data %>%
  mutate(
    change = lead(us_pct) - us_pct
    ,change = if_else(change > 0, TRUE, FALSE)
  ) %>%
  mutate_at("change", ~stringr::str_replace_na(.,"NA"))


o_g1 <- us_diabetes_data %>%
  ggplot(aes(x = year, y = us_pct)) +
  geom_line(color= "#D95F02") +
  # geom_line(aes(color = change, group = 1)) +
  geom_point(shape = 21, size = 3,color= "#D95F02") +
  # geom_point(aes(color = change),shape = 21, size = 3) +
  scale_color_manual(values = c(
    "TRUE" = "#D95F02"
    ,"FALSE" = "#7570B3"
  ), guide = FALSE) +
  labs(
    title    = "Percentage of Diagnosed Diabetes in Adults (18+), National Level"
    ,x       = NULL
    ,y       = NULL
    ,caption = "Note: Data from the CDC's National Health Interview Survey (NHIS)"
  )

o_g1


# ---- s-g1 -----------------------------------------------------------------


s_g1 <- state_map %>%
  st_drop_geometry() %>%
  ggplot(aes(x = year, y = percentage, color = region)) +
  geom_line(aes(group = id ),alpha = 0.3,na.rm = TRUE) +
  geom_smooth(method = "lm", se = FALSE) +
  ggpmisc::stat_poly_eq(formula = y ~ + x ,
                        aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
                        parse = TRUE) +
  geom_vline(xintercept = 2011, linetype = "dashed", color = "gray") +
  scale_color_brewer(palette    = "Dark2"
                     ,direction = -1
                     ,labels    = snakecase::to_title_case
  ) +
  labs(
    title    = "Percentage of Diagnosed Diabetes in Adults (18+) \nby State and Region"
    ,x       = NULL
    ,y       = NULL
    ,color   = "Region"
    ,caption = "Regions from US Census Bureau"
  )

s_g1


# ---- s-g2 ---------------------------------------------------------------

s_g2 <- state_map %>%
  st_drop_geometry() %>%
  filter(region == "south") %>%
  mutate_at("id", ~snakecase::to_title_case(.)) %>%
  ggplot(aes(x = year, y = percentage)) +
  geom_line(aes(group = id ),na.rm = TRUE, color= "#D95F02") +
  gghighlight::gghighlight(id == "North Carolina", label_params = list(vjust = 3)) +
  scale_y_continuous(breaks = seq(5,13,2)) +
  scale_x_continuous(minor_breaks = seq(2000,2016,1)) +
  labs(
    title    = "Percentage of Diagnosed Diabetes in Adults (18+) \nSouth Region"
    ,x       = NULL
    ,y       = NULL
    ,caption = "Regions from US Census Bureau"
  ) +
  theme()

s_g2


# ---- nc-g1 ----------------------------------------------------------------------

d1 <- nc_diabetes_data %>%
  group_by(year) %>%
  summarise(
    pct = mean(percentage)
    ,us_pct = mean(us_pct)
  ) %>%
  pivot_longer(
    cols       = c("pct", "us_pct")
    ,names_to  = "metric"
    ,values_to = "values"
  ) %>%
  mutate(
    metric = factor(metric
                    ,levels = c("pct","us_pct")
                    ,labels = c("NC", "National"))
  )

nc_g1 <- d1 %>%
  ggplot(aes(x = year, y = values, color = metric)) +
  geom_line() +
  geom_point(shape = 21, size = 3) +
  geom_vline(xintercept = 2011, linetype = "dashed", color = "gray") +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  scale_color_brewer(palette = "Dark2") +
  labs(
    x      = NULL
    ,y     = NULL
    ,color = NULL
    ,title = "Percent of Adults (20+) with Diagnosed Diabetes"
  )

nc_g1

# ---- nc-data-aberration ---------------------------------------------------

nc_g1a <- nc_diabetes_data %>%
  ggplot(aes(x = year, y = percentage)) +
  geom_line(aes(group = county),alpha = 0.4) +
  labs(
    x = NULL
    ,y = NULL
    ,color = NULL
  )

nc_g1a


# ---- nc-g2 -----------------------------------------------------------------

d2 <- nc_diabetes_data %>%
  select(-us_pct) %>%
  mutate(
    pct_rural  = if_else(rural == TRUE, percentage, FALSE)
    ,pct_urban = if_else(rural == FALSE, percentage, FALSE)
  ) %>%
  select(-countyfips,-percentage) %>%
  group_by(year) %>%
  summarise(
    pct_rural = mean(pct_rural,na.rm = TRUE)
    ,pct_urban = mean(pct_urban,na.rm = TRUE)
  ) %>% left_join(us_diabetes_data) %>%
  pivot_longer(
    cols       = c("us_pct", "pct_rural","pct_urban")
    ,names_to  = "metric"
    ,values_to = "value"
    ,values_drop_na = TRUE
  ) %>%
  mutate(
    metric = factor(metric,
                    levels  = c("pct_rural","pct_urban","us_pct")
                    ,labels = c("Rural","Urban","US")
    )
  )

nc_g2 <- d2 %>% ggplot(aes(x = year, y = value, color = metric)) +
  geom_line() +
  geom_point(shape = 21, size = 3) +
  geom_vline(xintercept = 2011, linetype = "dashed", color = "gray") +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  scale_color_brewer(palette = "Dark2") +
  labs(
    x      = NULL
    ,y     = NULL
    ,color = NULL
    ,title = "Percent of Adults (20+) with Diagnosed Diabetes \nDisplaying Rural vs Urban"
  )

nc_g2

# ---- spaghetti-plot ----------------------------------------------------

g50 <- nc_diabetes_data %>%
  filter(year < 2015) %>%
  mutate(
    rural = factor(rural
                   ,levels = c(TRUE,FALSE)
                   ,labels = c("Rural", "Urban")
    )
  ) %>%
  ggplot(aes(x = year, y = percentage, color = rural)) +
  geom_line(aes(group = county),alpha = 0.3) +
  geom_smooth(aes(group = rural), method = "loess", se= FALSE, size = 1.1) +
  scale_color_brewer(palette = "Dark2") +
  labs(
    title = "Percent of Adults (20+) with Diagnosed Diabetes \nAll North Carolina Counties"
    ,x = NULL
    ,y = NULL
    ,color = NULL
  )

g50


# ---- c-g1 --------------------------------------------------------------


nc_counties_map_binned <- nc_counties_map %>%
  filter(year < 2015) %>%
  mutate(
    bin = dlookr::binning(.$percentage, nbins = 6 ,type = "equal")
    ,bin = forcats::fct_recode(bin
                               ,"6.5   - 7.9"  =  "[6.5,7.97]"
                               ,"8.0  - 9.4" =  "(7.97,9.43]"
                               ,"9.5  - 10.9" =  "(9.43,10.9]"
                               ,"11.0 - 12.4" =  "(10.9,12.4]"
                               ,"12.5 - 13.8" =  "(12.4,13.8]"
                               ,"13.9 - 15.3" =  "(13.8,15.3]"
    )
  )

c_g1 <- nc_counties_map_binned %>%
  filter(year %in% c(2006,2014)) %>%
  ggplot() +
  geom_sf() + #blank geom_sf keeps gridlines from overlapping map
  geom_sf(aes(fill = bin,color = rural)) +
  geom_sf(data = nc_cities) +
  ggrepel::geom_text_repel(data = nc_cities,
                           aes(x = long, y = lat, label = city)
                           ,nudge_y = c(-1,1,1,-1,1)
                           ,nudge_x = c(0,0,0,-1,0)
  ) +
  geom_text(data = . %>% filter(rural == TRUE)
            ,aes(x = center_long, y = center_lat)
            ,label = "R"
            ,color = "#696969"
  ) +
  coord_sf(xlim = c(-84.5,-75.5), ylim = c(33.75,37)) +
  facet_wrap(~year) +
  scale_fill_viridis_d(alpha = 0.6, direction = -1) +
  scale_color_manual(
    values = c(
      "FALSE" = "gray"
      ,"TRUE" = "black"
    ),guide = 'none') +
  labs(
    title = "Estimated Diabetes in Adults (20+) by County"
    ,fill = "Percentage"
    ,y    = NULL
    ,x    = NULL
  ) +
  theme(
    panel.background = element_rect(fill = "aliceblue")
    ,panel.grid.major = element_line(color = "#D4D4D4", linetype = "dashed",
                                     size = 0.5)
    ,legend.position = "bottom"
    ,plot.title = element_text(hjust = 0.5)
  )

c_g1

# ---- county-distribution-histogram ------------------------------------

# Not USED
c_g1a <- nc_counties_map_binned %>%
  mutate(
    rural = factor(rural
                   ,levels = c(TRUE,FALSE)
                   ,labels = c("Rural", "Urban")
    )
  ) %>%
  filter(year %in% c(2006,2014)) %>%
  ggplot(aes(x = bin, fill = rural)) +
  geom_bar(stat = "count"
           ,position = "dodge"
  ) +
  geom_text(aes(label=..count..)
            ,position = position_dodge(width = 1)
            ,stat = "count"
            ,vjust = -0.1
            ,size = 5) +
  facet_wrap(~year) +
  scale_fill_brewer(palette = "Dark2") +
  labs(
    fill = NULL
    ,x   = NULL
    ,y   = NULL
  )

c_g1a

# ---- county-boxplot ----

c_g1c <- nc_counties_map %>%
  mutate(
    rural = factor(rural
                   ,levels = c(TRUE,FALSE)
                   ,labels = c("Rural", "Urban")
    )) %>%
  filter(year < 2015) %>%
  ggplot(aes(x = year, y = percentage, group = interaction(year,rural), fill = rural)) +
  geom_boxplot(alpha = 0.5) +
  scale_fill_brewer(palette = "Dark2") +
  scale_x_continuous(breaks = seq(2004,2014,2)) +
  labs(
    x      = NULL
    ,y     = NULL
    ,fill  = NULL
    ,title = "Distribution  of Estimated Cases by County 2006 - 2014"
  )

c_g1c


# ---- c-g4 ---------------------------------------------------------------
d3 <- nc_counties_map %>%
  st_drop_geometry() %>%
  filter(year %in% c(2006,2014)) %>%
  select(-countyfips,-us_pct) %>%
  pivot_wider(names_from = "year"
              ,values_from = "percentage") %>%
  mutate(
    pct_p  = `2014` - `2006`
    ,pct_c = ((`2014` - `2006`)/`2006`) * 100
  ) %>%
  left_join(nc_counties_map_raw) %>%
  st_as_sf()


c_g4 <- d3 %>%
  ggplot() +
  geom_sf() + #blank geom_sf keeps gridlines from overlapping map
  geom_sf(aes(fill = pct_c ,color = rural)) +
  geom_sf(data = nc_cities) +
  ggrepel::geom_text_repel(data = nc_cities,
                           aes(x = long, y = lat, label = city)
                           ,nudge_y = c(-1,1,1,-1,1)
                           ,nudge_x = c(0,0,0,-1,0)
  ) +
  geom_text(data = . %>% filter(rural == TRUE)
            ,aes(x = center_long, y = center_lat)
            ,label = "R"
            ,color = "#696969"
  ) +
  # scale_fill_viridis_c(alpha = 0.6, direction = -1) +
  scale_fill_gradient2(
    low = "#d01c8b"
    ,mid = "#f7f7f7"
    ,high = "#4dac26"
    ,midpoint = 0
  ) +
  scale_color_manual(
    values = c(
      "FALSE" = "gray"
      ,"TRUE" = "black"
    ),guide = 'none') +
  labs(
    title = "Percentage Change of Diagnosed Diabetes 2006-2014"
    ,fill = "Percentage"
    ,y    = NULL
    ,x    = NULL
  ) +
  theme(
    panel.background = element_rect(fill = "aliceblue")
    ,panel.grid.major = element_line(color = "#D4D4D4", linetype = "dashed",
                                     size = 0.5)
  )

c_g4


# ---- pct_p-histogram ----------------------------------------------------------


d4 <- d3 %>%
  st_drop_geometry() %>%
  mutate(
    rural = factor(rural
                   ,levels = c(TRUE,FALSE)
                   ,labels = c("Rural", "Urban")
    )
  )


mean_d4 <- d4 %>%
  group_by(rural) %>%
  summarise(.groups = "keep"
            ,pct_c = mean(pct_c)
  )

g51 <-  d4 %>%
  ggplot(aes(x = pct_c, fill = rural, y = ..density.., color = rural)) +
  geom_histogram(binwidth = 5, position = "identity", alpha = 0.3) +
  geom_density(alpha = 0.5) +
  facet_wrap(~rural, ncol = 1)  +
  geom_vline(aes(xintercept = pct_c), data = mean_d4) +
  geom_text(aes(x = pct_c, y = 0.038, label = round(pct_c, 2))
            ,data  = mean_d4
            ,hjust = -0.15
            ,size  = 5
            ,color = "#000000") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "#696969") +
  scale_color_brewer(palette = "Dark2", guide = NULL) +
  scale_fill_brewer(palette = "Dark2", guide = NULL) +
  labs(
    x = "Percentage Change"
    ,y = "Density"
    ,fill = NULL
  )
g51