diff --git a/.gitignore b/.gitignore index e597aad..b7a4307 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,3 @@ po/*~ rsconnect/ /.quarto/ - -# using this folder to move old blog posts over -old_posts/ \ No newline at end of file diff --git a/_quarto.yml b/_quarto.yml index ffd8a87..19bd8dc 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -1,9 +1,5 @@ project: type: website - render: - - "*.qmd" - - "*.rmd" - - "!old_posts/" #using for the time being to store old blog post while converting website: title: "Kyle Belanger" diff --git a/_site/blog.html b/_site/blog.html index 44f9f0a..d64b164 100644 --- a/_site/blog.html +++ b/_site/blog.html @@ -156,7 +156,7 @@ ul.task-list li input[type="checkbox"] { +
Categories
All (9)
Distill (1)
TidyTuesday (1)
@@ -204,7 +204,7 @@ Kyle Belanger -
+
@@ -267,7 +267,35 @@ Kyle Belanger
-
+
+
+

+
+ + +
+
@@ -295,7 +323,7 @@ Kyle Belanger
-
+
@@ -323,7 +351,7 @@ Kyle Belanger
-
+
@@ -351,7 +379,7 @@ Kyle Belanger
-
+
@@ -379,7 +407,7 @@ Kyle Belanger
-
+
diff --git a/_site/listings.json b/_site/listings.json index 881170c..244f892 100644 --- a/_site/listings.json +++ b/_site/listings.json @@ -5,6 +5,7 @@ "/posts/2021-02-26_tidytuesday-hbcu-enrollment/tidytuesday-2021-week-6-hbcu-enrolment.html", "/posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html", "/posts/2020-07-25_diabetes-data-collection-and-cleaning/diabetes-in-rural-north-carolina-data-collection-and-cleaning.html", + "/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends.html", "/posts/2020-06-22_excel-data-multiple-headers/importing-excel-data-with-multiple-header-rows.html", "/posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html", "/posts/2020-02-10_line-graphs-and-interactivity/line-graphs-and-interactivity.html", diff --git a/_site/posts/2020-01-04_my-start-to-r/my-start-to-r.html b/_site/posts/2020-01-04_my-start-to-r/my-start-to-r.html index e4567b0..967a773 100644 --- a/_site/posts/2020-01-04_my-start-to-r/my-start-to-r.html +++ b/_site/posts/2020-01-04_my-start-to-r/my-start-to-r.html @@ -255,7 +255,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin -

Citation

BibTeX citation:
@online{belanger2020,
+

Reuse

Citation

BibTeX citation:
@online{belanger2020,
   author = {Belanger, Kyle},
   title = {My {Start} to {R}},
   date = {2020-01-24},
diff --git a/_site/posts/2020-02-10_line-graphs-and-interactivity/line-graphs-and-interactivity.html b/_site/posts/2020-02-10_line-graphs-and-interactivity/line-graphs-and-interactivity.html
index ce0ce3e..702fb44 100644
--- a/_site/posts/2020-02-10_line-graphs-and-interactivity/line-graphs-and-interactivity.html
+++ b/_site/posts/2020-02-10_line-graphs-and-interactivity/line-graphs-and-interactivity.html
@@ -332,8 +332,8 @@ $ fiscal_year                              <fct> 2013, 2013, 2013, 2013, 2
 
 g2
-
- +
+
diff --git a/_site/posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data_files/figure-html/unnamed-chunk-5-1.png b/_site/posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data_files/figure-html/unnamed-chunk-5-1.png index 6654801..4ec0b1d 100644 Binary files a/_site/posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data_files/figure-html/unnamed-chunk-5-1.png and b/_site/posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/_site/posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data_files/figure-html/unnamed-chunk-7-1.png b/_site/posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data_files/figure-html/unnamed-chunk-7-1.png index 76dadac..0684d7d 100644 Binary files a/_site/posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data_files/figure-html/unnamed-chunk-7-1.png and b/_site/posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data_files/figure-html/unnamed-chunk-7-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends.html b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends.html new file mode 100644 index 0000000..724f156 --- /dev/null +++ b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends.html @@ -0,0 +1,1304 @@ + + + + + + + + + + + +Kyle Belanger - Diabetes in Rural North Carolina : Exploring Prevalence Trends + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+ +
+
+
+

Diabetes in Rural North Carolina : Exploring Prevalence Trends

+

This post introduces the exploration of the Diabetes epidemic in North Carolina

+
+
+ + +
+ +
+
Author
+ +
+ +
+
Published
+
+

June 25, 2020

+
+
+ + +
+ + +
+ + +
+ + + +
+ + + + +
+

Update

+

2022-03-15: Since this was posted, the CDC has updated how county-level diabetes prevalence is calculated. The data presented here uses the previous calculations and may no longer be correct. More can be read here

+
+
+

Abstract

+

Diabetes is growing at an epidemic rate in the United States. In North Carolina alone, diabetes and prediabetes cost an estimated $10.9 billion each year (American Diabetes Association, 2015). This post introduces the exploration of the Diabetes epidemic in North Carolina. Through a series of posts, this project will examine various public data available on diabetes and explore possible solutions to address the rise of diabetes in North Carolina. This investigation stems from the Capstone project of my Health Care Informatics Masters program. This post will answer the following questions:

+
+
    +
  1. What is the overall trend of diabetes prevalence in the United States?
  2. +
+
+
+
    +
  1. What is the trend of diabetes at a State Level and how does diabetes prevalence vary by state and region?
  2. +
+
+
+
    +
  1. How do trends in diabetes prevalence vary across counties of North Carolina?
  2. +
+
+
+
    +
  1. In which counties of North Carolina does the largest change in diabetes prevalence occur?
  2. +
+
+
+
    +
  1. How does change in diabetes prevalence compare between rural and urban counties?
  2. +
+
+
+
+

Environment

+

This section contains technical information for deeper analysis and reproduction. Casual readers are invited to skip it.

+

Packages used in this report.

+
+
+Code +
# Attach these packages so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path
+library(magrittr) # enables piping : %>%
+library(dplyr)    # data wrangling
+library(ggplot2)  # graphs
+library(tidyr)    # data tidying
+library(maps)
+library(mapdata)
+library(sf)
+library(readr)
+
+
+

Definitions of global objects (file paths, factor levels, object groups) used throughout the report.

+
+
+Code +
# Make the black-and-white theme the default for every plot in this report
+ggplot2::theme_set(ggplot2::theme_bw())
+
+
+
+
+

Data

+

The data for this exploration comes from several sources:

+
    +
  1. The Diabetes data set for state and county levels were sourced from the US Diabetes Surveillance System; Division of Diabetes Translation - Centers for Disease Control and Prevention. The data was downloaded one year per file, and compiled into a single data set for analysis.

  2. +
  3. The Diabetes data set for National level data were sourced from the CDC’s National Health Interview Survey (NHIS)

  4. +
  5. The list of rural counties was taken from The Office of Rural Health Policy, the list is available here

  6. +
+ +
+
+Code +
# Load the raw inputs from pinned commits of the project repository.
+
+# NC data: every column name is lower-cased on load.
+# rename_with() replaces the superseded rename_all().
+nc_diabetes_data_raw <- read_csv("https://raw.githubusercontent.com/mmmmtoasty19/nc-diabetes-epidemic-2020/62bdaa6971fbdff09214de7c013d40122abbe40d/data-public/derived/nc-diabetes-data.csv") %>%
+  rename_with(tolower)
+
+# US totals: the first two lines of the file are skipped before the header is read
+us_diabetes_data_raw <- read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/62bdaa6971fbdff09214de7c013d40122abbe40d/data-public/raw/us_diabetes_totals.csv"
+                                 ,skip = 2)
+
+# List of rural counties (used later through its `rural_counties` column)
+rural_counties <- read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/rural-counties.csv")
+
+# County centres: column names are supplied here, so the first line is read as data
+county_centers_raw <- read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/raw/nc_county_centers.csv", col_names = c("county", "lat","long"))
+
+# Diabetes Atlas data: skip two leading lines, drop the limit columns,
+# and force Percentage to be parsed as a double
+diabetes_atlas_data_raw <- read_csv("https://raw.githubusercontent.com/mmmmtoasty19/nc-diabetes-epidemic-2020/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/raw/DiabetesAtlasData.csv"
+                                    ,col_types = cols(LowerLimit = col_skip(),
+                                                     UpperLimit = col_skip(),
+                                                     Percentage = col_double()), skip = 2)
+
+
+ +
+
+Code +
# Load the NC county map and the US state map as sf objects, plus the NC city points.
+# maps::map() is namespace-qualified so that purrr::map() can never mask it.
+
+nc_counties_map_raw <- st_as_sf(maps::map("county", region = "north carolina", plot = FALSE, fill = TRUE)) %>%
+  # strip the "north carolina," prefix from the polygon IDs
+  # (plain mutate() replaces the superseded mutate_at())
+  mutate(ID = stringr::str_remove(ID, "north carolina,"))
+
+state_map_raw <- st_as_sf(maps::map("state", plot = FALSE, fill = TRUE))
+
+# City points: built from the long/lat columns, which are kept (remove = FALSE)
+# so they can also be used as plain x/y aesthetics for labels
+nc_cities <-  st_as_sf(read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/nc_cities.csv"),
+                       coords = c("long", "lat")
+                       ,remove = FALSE
+                       ,agr = "constant"
+                       ,crs = 4326)
+
+
+
+
+

Data Manipulation

+

The combined data used in this analysis can be downloaded here. The only tweaks done here are to combine the rural counties column, and the data for creating maps.

+
+

Tweaks

+ +
+
+Code +
# Clean the county-centre coordinates and convert them to numbers.
+# across() / plain mutate() replace the superseded mutate_all() and mutate_at().
+county_centers <- county_centers_raw %>%
+  # remove degree and plus signs, and turn the dash character into a plain minus
+  mutate(across(everything(), ~stringr::str_replace_all(.,
+                                       c("\\°"  = ""
+                                         ,"\\+" = ""
+                                         ,"\\–" = "-"
+                                       )
+  )
+  )) %>%
+  # transliterate to ASCII, then drop the "?" characters left in lat/long
+  mutate(across(c("lat","long"), ~iconv(.,from = 'UTF-8', to = 'ASCII//TRANSLIT'))
+         ,across(c("lat","long"),~stringr::str_remove_all(.,"\\?"))) %>%
+  mutate(
+    lat   = as.numeric(lat)
+    # sign flip: presumably the source lists western longitudes as positive - TODO confirm
+    ,long = as.numeric(long) * -1
+    ,county = tolower(county)
+  )
+
+
+# National series from 2000 onwards, reduced to the two columns used later
+us_diabetes_data <- us_diabetes_data_raw %>%
+  filter(Year >= 2000) %>%
+  select(year = Year, us_pct = `Total - Percentage`)
+
+# State-level series from 2000 onwards; state names lower-cased for joining
+diabetes_atlas_data <- diabetes_atlas_data_raw %>%
+  mutate(State = tolower(State)) %>%
+  filter(Year >= 2000)
+
+# Attach the state lookup table to the state polygons, matching on lower-cased state name
+state_map_abb <- state_map_raw %>%
+  left_join(read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/state-abb.csv") %>%
+              mutate(state = tolower(state))
+            ,by = c("ID" = "state") )
+
+
+
+
+

Merge

+ +
+
+Code +
# Flag rural counties in the NC data and join the US totals onto it.
+
+nc_diabetes_data <- nc_diabetes_data_raw %>%
+  # one mutate() evaluated top to bottom replaces the superseded mutate_at() calls
+  mutate(
+    # respell "Mcdowell" before the lookup; presumably the rural list uses "McDowell" - TODO confirm
+    county = stringr::str_replace_all(county, "Mcdowell", "McDowell")
+    ,rural = county %in% rural_counties$rural_counties
+    # lower-case only after the lookup, so the lookup sees the original capitalisation
+    ,county = tolower(county)
+  ) %>%
+  # no `by` given: joins on whichever columns the two tables have in common
+  left_join(us_diabetes_data)
+
+
+# County polygons + NC data + county centres (centre coordinates renamed to center_*)
+nc_counties_map <- nc_counties_map_raw %>%
+  left_join(nc_diabetes_data, by = c("ID" = "county")) %>%
+  left_join(county_centers, by = c("ID" = "county")) %>%
+  rename(
+    center_long = long
+    ,center_lat = lat)
+
+# State polygons + state-level diabetes data, with all column names lower-cased
+state_map <- state_map_abb %>%
+  left_join(diabetes_atlas_data, by = c("ID" = "State")) %>%
+  rename_all(tolower)
+
+
+
+
+
+

Overall - National Level

+
+
+Code +
# Add a direction flag to the national series, then plot the national trend.
+us_diabetes_data <- us_diabetes_data %>%
+  mutate(
+    # TRUE when the next row's value is higher; NA on the last row, where lead() has no value
+    change = (lead(us_pct) - us_pct) > 0
+    # stored as text, with the missing value spelled out as the string "NA"
+    ,change = stringr::str_replace_na(change, "NA")
+  )
+
+o_g1 <- us_diabetes_data %>%
+  ggplot(aes(x = year, y = us_pct)) +
+  geom_line(color= "#D95F02") +
+  # geom_line(aes(color = change, group = 1)) +
+  geom_point(shape = 21, size = 3,color= "#D95F02") +
+  # geom_point(aes(color = change),shape = 21, size = 3) +
+  # palette for the commented-out `color = change` variants above;
+  # guide = "none" replaces guide = FALSE, which is deprecated since ggplot2 3.3.4
+  scale_color_manual(values = c(
+    "TRUE" = "#D95F02"
+    ,"FALSE" = "#7570B3"
+  ), guide = "none") +
+  labs(
+    title    = "Percentage of Diagnosed Diabetes in Adults (18+), National Level"
+    ,x       = NULL
+    ,y       = NULL
+    ,caption = "Note: Data from the CDC's National Health Interview Survey (NHIS)"
+  )
+
+o_g1
+
+
+

+
+
+

Overall, the national average for diagnosed diabetes sharply rose through the early 2000’s, leveling off around 2010. These numbers however, are estimates based on the self-reported response to the CDC’s National Health Interview Survey, and do not represent the actual confirmed diagnoses. The CDC estimates that 1 in 5 adults have undiagnosed diabetes, therefore the numbers reported by the NHIS are likely to underestimate the true prevalence (Centers for Disease Control and Prevention, 2020).

+
+
+

Overall - State Level

+

State and County level data on diabetes prevalence are taken from the CDC’s Behavioral Risk Factor Surveillance System (BRFSS). These results are based on the question “Has a doctor, nurse, or other health professional ever told you that you have diabetes?”. Women who only experienced diabetes during pregnancy were excluded from the counts. The BRFSS is an ongoing, monthly telephone survey of the non-institutionalized adults (aged 18 years or older) in each state. The year 2011 saw a major change to the methodology of the survey, which started to include homes without a landline phone. This change was expected to increase coverage of lower income, lower educational levels, and younger age groups, because these groups often exclusively rely on cellular telephones for personal communication (Pierannunzi et al., 2012).

+
+
+Code +
# State-level prevalence over time: one faint line per state, coloured by region,
+# plus one linear trend line and its fitted equation per region.
+s_g1 <- state_map %>%
+  st_drop_geometry() %>%
+  ggplot(aes(x = year, y = percentage, color = region)) +
+  geom_line(aes(group = id ),alpha = 0.3,na.rm = TRUE) +
+  geom_smooth(method = "lm", se = FALSE) +
+  # after_stat() replaces the `..name..` notation deprecated in ggplot2 3.4.0;
+  # the two labels are glued with plotmath spacing ("~~~") and parsed as an expression
+  ggpmisc::stat_poly_eq(formula = y ~ x,
+                        aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
+                        parse = TRUE) +
+  # dashed reference line at 2011
+  geom_vline(xintercept = 2011, linetype = "dashed", color = "gray") +
+  scale_color_brewer(palette    = "Dark2"
+                     ,direction = -1
+                     ,labels    = snakecase::to_title_case
+  ) +
+  labs(
+    title    = "Percentage of Diagnosed Diabetes in Adults (18+) \nby State and Region"
+    ,x       = NULL
+    ,y       = NULL
+    ,color   = "Region"
+    ,caption = "Regions from US Census Bureau"
+  )
+
+s_g1
+
+
+

+
+
+

The above graph shows diabetes prevalence trends by state, grouped into regions based on the US Census classification regions. While all regions of the United States show positive growth in diabetes prevalence, the south exhibits a slightly higher growth rate, as well as the highest prevalence.

+
+
+Code +
# Prevalence over time for the states in the "south" region, with North Carolina highlighted.
+s_g2 <- state_map %>%
+  st_drop_geometry() %>%
+  filter(region == "south") %>%
+  # title-case the state names so the highlight filter below can match "North Carolina"
+  # (plain mutate() replaces the superseded mutate_at())
+  mutate(id = snakecase::to_title_case(id)) %>%
+  ggplot(aes(x = year, y = percentage)) +
+  geom_line(aes(group = id ),na.rm = TRUE, color= "#D95F02") +
+  gghighlight::gghighlight(id == "North Carolina", label_params = list(vjust = 3)) +
+  scale_y_continuous(breaks = seq(5,13,2)) +
+  scale_x_continuous(minor_breaks = seq(2000,2016,1)) +
+  # the trailing empty theme() call was a no-op and has been removed
+  labs(
+    title    = "Percentage of Diagnosed Diabetes in Adults (18+) \nSouth Region"
+    ,x       = NULL
+    ,y       = NULL
+    ,caption = "Regions from US Census Bureau"
+  )
+
+s_g2
+
+
+

+
+
+

When focusing on the south region, North Carolina falls close to the middle of diabetes prevalence.

+
+
+

Overall - North Carolina

+

When examining the trajectory for North Carolina, we can see that it has been consistently higher than the national average. We see that in 2016 there was a large spike in diagnosed cases; unfortunately this is the last available year so it is unclear whether the upward trend continues. The graph below compares state-level average to the national average. Notice that the trend line is slightly higher than in the previous graphs: this is due to the age cut offs used for National and State level data vs County Level data. Previous data used 18 years of age as a cutoff for classifying adults, whereas the county level data uses 20. Due to removing 18- and 19-year-olds from the population, who typically have fewer diagnosed cases of diabetes than those of older ages, the computed prevalence increases slightly.

+
+
+Code +
# Per-year mean of the county percentages next to the national percentage,
+# reshaped to long form so both series can share one colour scale.
+d1 <- nc_diabetes_data %>%
+  group_by(year) %>%
+  summarise(pct = mean(percentage), us_pct = mean(us_pct)) %>%
+  pivot_longer(cols = c(pct, us_pct), names_to = "metric", values_to = "values") %>%
+  mutate(metric = factor(metric, levels = c("pct", "us_pct"), labels = c("NC", "National")))
+
+# Line-and-point chart of both series, with a dashed reference line at 2011
+nc_g1 <- ggplot(d1, aes(x = year, y = values, color = metric)) +
+  geom_line() +
+  geom_point(shape = 21, size = 3) +
+  geom_vline(xintercept = 2011, linetype = "dashed", color = "gray") +
+  scale_y_continuous(labels = \(value) paste0(value, "%")) +
+  scale_color_brewer(palette = "Dark2") +
+  labs(
+    title = "Percent of Adults (20+) with Diagnosed Diabetes",
+    x     = NULL,
+    y     = NULL,
+    color = NULL
+  )
+
+nc_g1
+
+
+

+
+
+

We see a spike in 2016, the last year for which the data are available. However, we should be careful with our interpretation of this pattern, because the examination of the county-level trajectories reveals an aberration in the trend that requires a more rigorous investigation.

+
+
+Code +
# One semi-transparent line per county, without axis or legend titles
+nc_g1a <- ggplot(nc_diabetes_data, aes(x = year, y = percentage)) +
+  geom_line(aes(group = county), alpha = 0.4) +
+  labs(x = NULL, y = NULL, color = NULL)
+
+nc_g1a
+
+
+

+
+
+

While all of North Carolina has a higher prevalence than the national average, rural counties have systematically higher prevalence of diabetes than urban counties. Note that after 2011 both Urban and Rural counties break the upward trend exhibited in the previous 5 years. This could be explained by the addition of cell phones to the BRFSS survey, as many rural areas are often lower income areas and may only rely on a cell phone for communication. As mentioned previously, there is an odd spike in cases in 2016 that can’t be explained by current documentation. For the purpose of this evaluation, 2016 will be excluded from the county level data since the odd trend cannot be explained and no further data is available to determine if this is a real spike or could be attributed to methodology change or data quality.

+
+
+Code +
# Yearly mean prevalence for rural and for urban counties, alongside the US figure.
+d2 <- nc_diabetes_data %>%
+  mutate(
+    # Rows of the other group must be NA, not FALSE: dplyr >= 1.1 coerces FALSE to 0,
+    # which na.rm = TRUE does not remove, so every mean would be pulled towards zero.
+    pct_rural  = if_else(rural, percentage, NA_real_)
+    ,pct_urban = if_else(!rural, percentage, NA_real_)
+  ) %>%
+  group_by(year) %>%
+  # summarise() keeps only `year` and the two means, so no select() is needed beforehand
+  summarise(
+    pct_rural = mean(pct_rural,na.rm = TRUE)
+    ,pct_urban = mean(pct_urban,na.rm = TRUE)
+  ) %>%
+  # `year` is the only column shared with us_diabetes_data at this point
+  left_join(us_diabetes_data, by = "year") %>%
+  pivot_longer(
+    cols       = c("us_pct", "pct_rural","pct_urban")
+    ,names_to  = "metric"
+    ,values_to = "value"
+    ,values_drop_na = TRUE
+  ) %>%
+  mutate(
+    metric = factor(metric,
+                    levels  = c("pct_rural","pct_urban","us_pct")
+                    ,labels = c("Rural","Urban","US")
+    )
+  )
+
+# Line-and-point chart of the three series, with a dashed reference line at 2011
+nc_g2 <- d2 %>% ggplot(aes(x = year, y = value, color = metric)) +
+  geom_line() +
+  geom_point(shape = 21, size = 3) +
+  geom_vline(xintercept = 2011, linetype = "dashed", color = "gray") +
+  scale_y_continuous(labels = function(x) paste0(x, "%")) +
+  scale_color_brewer(palette = "Dark2") +
+  labs(
+    x      = NULL
+    ,y     = NULL
+    ,color = NULL
+    ,title = "Percent of Adults (20+) with Diagnosed Diabetes \nDisplaying Rural vs Urban"
+  )
+
+nc_g2
+
+
+

+
+
+
+
+

By County - Geographical

+

County level data first became available in 2004; three years of data are used to arrive at each estimate. For example, the 2006 estimates were computed using the data from 2005, 2006, and 2007 BRFSS survey rounds. The county-level estimates were based on indirect model-dependent estimates using Bayesian multilevel modeling techniques (JNK, 2003; Barker et al., 2013). This model-dependent approach employs a statistical model that “borrows strength” in making an estimate for one county from BRFSS data collected in other counties and states. Multilevel Binomial regression models with random effects of demographic variables (age 20-44, 45-64, >=65; race/ethnicity; sex) at the county-level were developed. Estimates were adjusted for age to the 2000 US standard population using age groups of 20-44, 45-64, and 65 or older (Klein & Schoenborn, 2001).

+
+
+Code +
# County trend lines before 2015, coloured rural vs urban, with one loess curve per group.
+g50 <- nc_diabetes_data %>%
+  filter(year < 2015) %>%
+  mutate(
+    # TRUE/FALSE -> "Rural"/"Urban", with Rural as the first level
+    rural = factor(rural
+                   ,levels = c(TRUE,FALSE)
+                   ,labels = c("Rural", "Urban")
+    )
+  ) %>%
+  ggplot(aes(x = year, y = percentage, color = rural)) +
+  geom_line(aes(group = county),alpha = 0.3) +
+  # linewidth replaces `size`, which is deprecated for lines since ggplot2 3.4.0
+  geom_smooth(aes(group = rural), method = "loess", se= FALSE, linewidth = 1.1) +
+  scale_color_brewer(palette = "Dark2") +
+  labs(
+    title = "Percent of Adults (20+) with Diagnosed Diabetes \nAll North Carolina Counties"
+    ,x = NULL
+    ,y = NULL
+    ,color = NULL
+  )
+
+g50
+
+
+

+
+
+

When viewing all county trend lines together, we see that the loess line for both urban and rural follows a similar trend for the time period.

+

The following graphs display the total estimated prevalence of diabetes in each of the 100 North Carolina counties. To keep the scaling consistent between the graphs, we binned the estimates into 6 intervals of the same size. Rural counties are highlighted with a stronger border line as well as a letter “R” in respective geographic centers. These graphs allow us to view geographical clusters of diabetes prevalence.

+
+
+Code +
# Bin county prevalence (years before 2015) into 6 equal-width intervals, then map 2006 and 2014.
+nc_counties_map_binned <- nc_counties_map %>%
+  filter(year < 2015) %>%
+  mutate(
+    bin = dlookr::binning(.$percentage, nbins = 6 ,type = "equal")
+    # NOTE(review): the interval labels below are hard-coded to the bin edges this
+    # data produced; they need updating if the data or the binning changes
+    ,bin = forcats::fct_recode(bin
+                               ,"6.5   - 7.9"  =  "[6.5,7.97]"
+                               ,"8.0  - 9.4" =  "(7.97,9.43]"
+                               ,"9.5  - 10.9" =  "(9.43,10.9]"
+                               ,"11.0 - 12.4" =  "(10.9,12.4]"
+                               ,"12.5 - 13.8" =  "(12.4,13.8]"
+                               ,"13.9 - 15.3" =  "(13.8,15.3]"
+    )
+  )
+
+c_g1 <- nc_counties_map_binned %>%
+  filter(year %in% c(2006,2014)) %>%
+  ggplot() +
+  geom_sf() + #blank geom_sf keeps gridlines from overlapping map
+  # fill = prevalence bin; border colour = rural flag (see scale_color_manual below)
+  geom_sf(aes(fill = bin,color = rural)) +
+  geom_sf(data = nc_cities) +
+  # one nudge value per city label, so these vectors assume nc_cities has five rows
+  ggrepel::geom_text_repel(data = nc_cities,
+                           aes(x = long, y = lat, label = city)
+                           ,nudge_y = c(-1,1,1,-1,1)
+                           ,nudge_x = c(0,0,0,-1,0)
+  ) +
+  # letter "R" at the centre coordinates of every rural county
+  geom_text(data = . %>% filter(rural == TRUE)
+            ,aes(x = center_long, y = center_lat)
+            ,label = "R"
+            ,color = "#696969"
+  ) +
+  coord_sf(xlim = c(-84.5,-75.5), ylim = c(33.75,37)) +
+  facet_wrap(~year) +
+  scale_fill_viridis_d(alpha = 0.6, direction = -1) +
+  scale_color_manual(
+    values = c(
+      "FALSE" = "gray"
+      ,"TRUE" = "black"
+    ),guide = 'none') +
+  labs(
+    title = "Estimated Diabetes in Adults (20+) by County"
+    ,fill = "Percentage"
+    ,y    = NULL
+    ,x    = NULL
+  ) +
+  theme(
+    panel.background = element_rect(fill = "aliceblue")
+    # linewidth replaces `size`, which is deprecated for line elements since ggplot2 3.4.0
+    ,panel.grid.major = element_line(color = "#D4D4D4", linetype = "dashed",
+                                     linewidth = 0.5)
+    ,legend.position = "bottom"
+    ,plot.title = element_text(hjust = 0.5)
+  )
+
+c_g1
+
+
+

+
+
+

The following box plot displays the distribution of estimated cases by county from 2006 to 2014. For all years of current data the mean of rural counties is higher than that of their urban counterparts.

+
+
+Code +
# Box plots of county prevalence for each year before 2015, one box per year and rural/urban group.
+c_g1c <- nc_counties_map %>%
+  filter(year < 2015) %>%
+  mutate(rural = factor(rural, levels = c(TRUE, FALSE), labels = c("Rural", "Urban"))) %>%
+  ggplot(aes(x = year, y = percentage, fill = rural, group = interaction(year, rural))) +
+  geom_boxplot(alpha = 0.5) +
+  scale_fill_brewer(palette = "Dark2") +
+  scale_x_continuous(breaks = seq(2004, 2014, 2)) +
+  labs(
+    title = "Distribution  of Estimated Cases by County 2006 - 2014",
+    x     = NULL,
+    y     = NULL,
+    fill  = NULL
+  )
+
+c_g1c
+
+
+

+
+
+
+
+

By County - Percent Change

+

The following graphs display the overall change in estimated prevalence between 2006 to 2014.

+
+
+Code +
# Per-county change in prevalence between 2006 and 2014, re-attached to the county polygons.
+d3 <- nc_counties_map %>%
+  st_drop_geometry() %>%
+  filter(year %in% c(2006,2014)) %>%
+  select(-countyfips,-us_pct) %>%
+  # one row per county, with the two years as columns `2006` and `2014`
+  pivot_wider(names_from = "year"
+              ,values_from = "percentage") %>%
+  mutate(
+    pct_p  = `2014` - `2006`          # change in percentage points
+    ,pct_c = (pct_p / `2006`) * 100   # change relative to the 2006 value, in percent
+  ) %>%
+  # geometry was dropped above for reshaping; join the polygons back by county ID
+  left_join(nc_counties_map_raw, by = "ID") %>%
+  st_as_sf()
+
+
+c_g4 <- d3 %>%
+  ggplot() +
+  geom_sf() + #blank geom_sf keeps gridlines from overlapping map
+  # fill = relative change; border colour = rural flag (see scale_color_manual below)
+  geom_sf(aes(fill = pct_c ,color = rural)) +
+  geom_sf(data = nc_cities) +
+  # one nudge value per city label, so these vectors assume nc_cities has five rows
+  ggrepel::geom_text_repel(data = nc_cities,
+                           aes(x = long, y = lat, label = city)
+                           ,nudge_y = c(-1,1,1,-1,1)
+                           ,nudge_x = c(0,0,0,-1,0)
+  ) +
+  # letter "R" at the centre coordinates of every rural county
+  geom_text(data = . %>% filter(rural == TRUE)
+            ,aes(x = center_long, y = center_lat)
+            ,label = "R"
+            ,color = "#696969"
+  ) +
+  # scale_fill_viridis_c(alpha = 0.6, direction = -1) +
+  # diverging palette centred on zero change
+  scale_fill_gradient2(
+    low = "#d01c8b"
+    ,mid = "#f7f7f7"
+    ,high = "#4dac26"
+    ,midpoint = 0
+  ) +
+  scale_color_manual(
+    values = c(
+      "FALSE" = "gray"
+      ,"TRUE" = "black"
+    ),guide = 'none') +
+  labs(
+    title = "Percentage Change of Diagnosed Diabetes 2006-2014"
+    ,fill = "Percentage"
+    ,y    = NULL
+    ,x    = NULL
+  ) +
+  theme(
+    panel.background = element_rect(fill = "aliceblue")
+    # linewidth replaces `size`, which is deprecated for line elements since ggplot2 3.4.0
+    ,panel.grid.major = element_line(color = "#D4D4D4", linetype = "dashed",
+                                     linewidth = 0.5)
+  )
+
+c_g4
+
+
+

+
+
+

The following chart displays the density curve of the percentage change for both rural and urban counties. It is notable that the mean of change for urban counties is actually higher than the mean for rural counties. However, we also see that most change for both regions is positive growth. In fact, only 16 rural and 10 urban counties experienced negative change in the given time frame, while 35 rural and 34 urban counties experienced growth in the same period.

+
+
+Code +
# Distribution of the 2006-2014 relative change, separately for rural and urban counties.
+d4 <- d3 %>%
+  st_drop_geometry() %>%
+  mutate(
+    # TRUE/FALSE -> "Rural"/"Urban", with Rural as the first level
+    rural = factor(rural
+                   ,levels = c(TRUE,FALSE)
+                   ,labels = c("Rural", "Urban")
+    )
+  )
+
+
+# Mean relative change per group, used for the vertical line and its label in each facet
+mean_d4 <- d4 %>%
+  group_by(rural) %>%
+  summarise(.groups = "keep"
+            ,pct_c = mean(pct_c)
+  )
+
+g51 <-  d4 %>%
+  # after_stat(density) replaces `..density..`, deprecated in ggplot2 3.4.0
+  ggplot(aes(x = pct_c, fill = rural, y = after_stat(density), color = rural)) +
+  geom_histogram(binwidth = 5, position = "identity", alpha = 0.3) +
+  geom_density(alpha = 0.5) +
+  facet_wrap(~rural, ncol = 1)  +
+  # group mean: solid vertical line plus its value rounded to two decimals
+  geom_vline(aes(xintercept = pct_c), data = mean_d4) +
+  geom_text(aes(x = pct_c, y = 0.038, label = round(pct_c, 2))
+            ,data  = mean_d4
+            ,hjust = -0.15
+            ,size  = 5
+            ,color = "#000000") +
+  # dashed reference line at zero change
+  geom_vline(xintercept = 0, linetype = "dashed", color = "#696969") +
+  scale_color_brewer(palette = "Dark2", guide = NULL) +
+  scale_fill_brewer(palette = "Dark2", guide = NULL) +
+  labs(
+    x = "Percentage Change"
+    ,y = "Density"
+    ,fill = NULL
+  )
+g51
+
+
+

+
+
+
+
+

Conclusion and Next Steps

+

The original hypothesis of this report was that rural counties were growing at a higher rate than their urban counterparts. Throughout this post it has been shown that this hypothesis is incorrect: just being a rural county does not indicate diabetes growth; in fact, the growth rate throughout North Carolina has been consistent. Further posts will explore other reasons for these trends. As the current post merely explores the trends and differences using data visualizations, a more rigorous and formal evaluation of these comparisons is in order.

+
+
+

Session information

+

===========================================================================

+

For the sake of documentation and reproducibility, the current report was rendered in the following environment. Click the line below to expand.

+
+ +Environment + +
+
+Code +
# Report the session: base R's sessionInfo() when devtools is not installed,
+# otherwise the devtools session report
+if (!requireNamespace("devtools", quietly = TRUE)) {
+  sessionInfo()
+} else {
+  devtools::session_info()
+}
+
+
+
─ Session info ───────────────────────────────────────────────────────────────
+ setting  value
+ version  R version 4.2.1 (2022-06-23 ucrt)
+ os       Windows 10 x64 (build 22621)
+ system   x86_64, mingw32
+ ui       RTerm
+ language (EN)
+ collate  English_United States.utf8
+ ctype    English_United States.utf8
+ tz       America/New_York
+ date     2023-10-12
+ pandoc   3.1.8 @ C:/PROGRA~1/Pandoc/ (via rmarkdown)
+
+─ Packages ───────────────────────────────────────────────────────────────────
+ package           * version date (UTC) lib source
+ bit                 4.0.4   2020-08-04 [1] CRAN (R 4.2.2)
+ bit64               4.0.5   2020-08-30 [1] CRAN (R 4.2.2)
+ cachem              1.0.6   2021-08-19 [1] CRAN (R 4.2.2)
+ callr               3.7.3   2022-11-02 [1] CRAN (R 4.2.2)
+ class               7.3-20  2022-01-16 [2] CRAN (R 4.2.1)
+ classInt            0.4-10  2023-09-05 [1] CRAN (R 4.2.3)
+ cli                 3.4.1   2022-09-23 [1] CRAN (R 4.2.2)
+ colorspace          2.0-3   2022-02-21 [1] CRAN (R 4.2.2)
+ confintr            1.0.2   2023-06-04 [1] CRAN (R 4.2.3)
+ crayon              1.5.2   2022-09-29 [1] CRAN (R 4.2.2)
+ crul                1.4.0   2023-05-17 [1] CRAN (R 4.2.3)
+ curl                4.3.3   2022-10-06 [1] CRAN (R 4.2.2)
+ DBI                 1.1.3   2022-06-18 [1] CRAN (R 4.2.2)
+ devtools            2.4.5   2022-10-11 [1] CRAN (R 4.2.2)
+ digest              0.6.30  2022-10-18 [1] CRAN (R 4.2.2)
+ dlookr              0.6.2   2023-07-01 [1] CRAN (R 4.2.3)
+ dplyr             * 1.1.3   2023-09-03 [1] CRAN (R 4.2.3)
+ e1071               1.7-13  2023-02-01 [1] CRAN (R 4.2.3)
+ ellipsis            0.3.2   2021-04-29 [1] CRAN (R 4.2.2)
+ evaluate            0.21    2023-05-05 [1] CRAN (R 4.2.3)
+ extrafont           0.19    2023-01-18 [1] CRAN (R 4.2.2)
+ extrafontdb         1.0     2012-06-11 [1] CRAN (R 4.2.0)
+ fansi               1.0.3   2022-03-24 [1] CRAN (R 4.2.2)
+ farver              2.1.1   2022-07-06 [1] CRAN (R 4.2.2)
+ fastmap             1.1.0   2021-01-25 [1] CRAN (R 4.2.2)
+ fontBitstreamVera   0.1.1   2017-02-01 [1] CRAN (R 4.2.0)
+ fontLiberation      0.1.0   2016-10-15 [1] CRAN (R 4.2.0)
+ fontquiver          0.2.1   2017-02-01 [1] CRAN (R 4.2.3)
+ forcats             1.0.0   2023-01-29 [1] CRAN (R 4.2.3)
+ Formula             1.2-5   2023-02-24 [1] CRAN (R 4.2.2)
+ fs                  1.5.2   2021-12-08 [1] CRAN (R 4.2.2)
+ gdtools             0.3.3   2023-03-27 [1] CRAN (R 4.2.3)
+ generics            0.1.3   2022-07-05 [1] CRAN (R 4.2.2)
+ gfonts              0.2.0   2023-01-08 [1] CRAN (R 4.2.3)
+ gghighlight         0.4.0   2022-10-16 [1] CRAN (R 4.2.3)
+ ggplot2           * 3.4.2   2023-04-03 [1] CRAN (R 4.2.3)
+ ggpmisc             0.5.4-1 2023-08-13 [1] CRAN (R 4.2.3)
+ ggpp                0.5.4   2023-08-12 [1] CRAN (R 4.2.3)
+ ggrepel             0.9.3   2023-02-03 [1] CRAN (R 4.2.3)
+ glue                1.6.2   2022-02-24 [1] CRAN (R 4.2.2)
+ gridExtra           2.3     2017-09-09 [1] CRAN (R 4.2.2)
+ gtable              0.3.3   2023-03-21 [1] CRAN (R 4.2.3)
+ hms                 1.1.3   2023-03-21 [1] CRAN (R 4.2.3)
+ hrbrthemes          0.8.0   2020-03-06 [1] CRAN (R 4.2.3)
+ htmltools           0.5.4   2022-12-07 [1] CRAN (R 4.2.2)
+ htmlwidgets         1.6.2   2023-03-17 [1] CRAN (R 4.2.3)
+ httpcode            0.3.0   2020-04-10 [1] CRAN (R 4.2.3)
+ httpuv              1.6.8   2023-01-12 [1] CRAN (R 4.2.2)
+ httr                1.4.6   2023-05-08 [1] CRAN (R 4.2.3)
+ inum                1.0-5   2023-03-09 [1] CRAN (R 4.2.3)
+ jsonlite            1.8.3   2022-10-21 [1] CRAN (R 4.2.2)
+ kableExtra          1.3.4   2021-02-20 [1] CRAN (R 4.2.2)
+ KernSmooth          2.23-20 2021-05-03 [2] CRAN (R 4.2.1)
+ knitr               1.43    2023-05-25 [1] CRAN (R 4.2.3)
+ labeling            0.4.2   2020-10-20 [1] CRAN (R 4.2.0)
+ later               1.3.0   2021-08-18 [1] CRAN (R 4.2.2)
+ lattice             0.20-45 2021-09-22 [2] CRAN (R 4.2.1)
+ libcoin             1.0-10  2023-09-27 [1] CRAN (R 4.2.3)
+ lifecycle           1.0.3   2022-10-07 [1] CRAN (R 4.2.2)
+ magrittr          * 2.0.3   2022-03-30 [1] CRAN (R 4.2.2)
+ mapdata           * 2.3.1   2022-11-01 [1] CRAN (R 4.2.3)
+ maps              * 3.4.1   2022-10-30 [1] CRAN (R 4.2.3)
+ MASS                7.3-60  2023-05-04 [1] CRAN (R 4.2.3)
+ Matrix              1.5-4.1 2023-05-18 [1] CRAN (R 4.2.3)
+ MatrixModels        0.5-1   2022-09-11 [1] CRAN (R 4.2.3)
+ memoise             2.0.1   2021-11-26 [1] CRAN (R 4.2.2)
+ mgcv                1.8-40  2022-03-29 [2] CRAN (R 4.2.1)
+ mime                0.12    2021-09-28 [1] CRAN (R 4.2.0)
+ miniUI              0.1.1.1 2018-05-18 [1] CRAN (R 4.2.2)
+ munsell             0.5.0   2018-06-12 [1] CRAN (R 4.2.2)
+ mvtnorm             1.2-2   2023-06-08 [1] CRAN (R 4.2.3)
+ nlme                3.1-157 2022-03-25 [2] CRAN (R 4.2.1)
+ pagedown            0.20    2022-12-13 [1] CRAN (R 4.2.3)
+ partykit            1.2-20  2023-04-14 [1] CRAN (R 4.2.3)
+ pillar              1.9.0   2023-03-22 [1] CRAN (R 4.2.3)
+ pkgbuild            1.4.2   2023-06-26 [1] CRAN (R 4.2.1)
+ pkgconfig           2.0.3   2019-09-22 [1] CRAN (R 4.2.2)
+ pkgload             1.3.2   2022-11-16 [1] CRAN (R 4.2.2)
+ polynom             1.4-1   2022-04-11 [1] CRAN (R 4.2.3)
+ prettyunits         1.1.1   2020-01-24 [1] CRAN (R 4.2.2)
+ processx            3.8.1   2023-04-18 [1] CRAN (R 4.2.3)
+ profvis             0.3.8   2023-05-02 [1] CRAN (R 4.2.3)
+ promises            1.2.0.1 2021-02-11 [1] CRAN (R 4.2.2)
+ proxy               0.4-27  2022-06-09 [1] CRAN (R 4.2.3)
+ ps                  1.7.5   2023-04-18 [1] CRAN (R 4.2.3)
+ purrr               1.0.1   2023-01-10 [1] CRAN (R 4.2.3)
+ quantreg            5.95    2023-04-08 [1] CRAN (R 4.2.3)
+ R6                  2.5.1   2021-08-19 [1] CRAN (R 4.2.2)
+ RColorBrewer        1.1-3   2022-04-03 [1] CRAN (R 4.2.0)
+ Rcpp                1.0.9   2022-07-08 [1] CRAN (R 4.2.2)
+ reactable           0.4.4   2023-03-12 [1] CRAN (R 4.2.3)
+ readr             * 2.1.3   2022-10-01 [1] CRAN (R 4.2.2)
+ remotes             2.4.2   2021-11-30 [1] CRAN (R 4.2.2)
+ rlang               1.1.0   2023-03-14 [1] CRAN (R 4.2.3)
+ rmarkdown           2.22    2023-06-01 [1] CRAN (R 4.2.3)
+ rpart               4.1.16  2022-01-24 [2] CRAN (R 4.2.1)
+ rstudioapi          0.14    2022-08-22 [1] CRAN (R 4.2.2)
+ Rttf2pt1            1.3.12  2023-01-22 [1] CRAN (R 4.2.2)
+ rvest               1.0.3   2022-08-19 [1] CRAN (R 4.2.3)
+ scales              1.2.1   2022-08-20 [1] CRAN (R 4.2.2)
+ sessioninfo         1.2.2   2021-12-06 [1] CRAN (R 4.2.2)
+ sf                * 1.0-14  2023-07-11 [1] CRAN (R 4.2.3)
+ shiny               1.7.4   2022-12-15 [1] CRAN (R 4.2.1)
+ showtext            0.9-6   2023-05-03 [1] CRAN (R 4.2.3)
+ showtextdb          3.0     2020-06-04 [1] CRAN (R 4.2.3)
+ snakecase           0.11.0  2019-05-25 [1] CRAN (R 4.2.2)
+ SparseM             1.81    2021-02-18 [1] CRAN (R 4.2.0)
+ stringi             1.7.8   2022-07-11 [1] CRAN (R 4.2.1)
+ stringr             1.5.0   2022-12-02 [1] CRAN (R 4.2.3)
+ survival            3.3-1   2022-03-03 [2] CRAN (R 4.2.1)
+ svglite             2.1.1   2023-01-10 [1] CRAN (R 4.2.2)
+ sysfonts            0.8.8   2022-03-13 [1] CRAN (R 4.2.3)
+ systemfonts         1.0.4   2022-02-11 [1] CRAN (R 4.2.2)
+ tibble              3.2.1   2023-03-20 [1] CRAN (R 4.2.3)
+ tidyr             * 1.3.0   2023-01-24 [1] CRAN (R 4.2.3)
+ tidyselect          1.2.0   2022-10-10 [1] CRAN (R 4.2.2)
+ tzdb                0.3.0   2022-03-28 [1] CRAN (R 4.2.2)
+ units               0.8-4   2023-09-13 [1] CRAN (R 4.2.3)
+ urlchecker          1.0.1   2021-11-30 [1] CRAN (R 4.2.2)
+ usethis             2.2.1   2023-06-23 [1] CRAN (R 4.2.3)
+ utf8                1.2.2   2021-07-24 [1] CRAN (R 4.2.2)
+ vctrs               0.6.2   2023-04-19 [1] CRAN (R 4.2.3)
+ viridisLite         0.4.2   2023-05-02 [1] CRAN (R 4.2.3)
+ vroom               1.6.0   2022-09-30 [1] CRAN (R 4.2.2)
+ webshot             0.5.4   2022-09-26 [1] CRAN (R 4.2.2)
+ withr               2.5.0   2022-03-03 [1] CRAN (R 4.2.2)
+ xfun                0.39    2023-04-20 [1] CRAN (R 4.2.3)
+ xml2                1.3.5   2023-07-06 [1] CRAN (R 4.2.3)
+ xtable              1.8-4   2019-04-21 [1] CRAN (R 4.2.2)
+ yaml                2.3.6   2022-10-18 [1] CRAN (R 4.2.1)
+
+ [1] C:/Users/belangew/AppData/Local/R/win-library/4.2
+ [2] C:/Program Files/R/R-4.2.1/library
+
+──────────────────────────────────────────────────────────────────────────────
+
+
+
+
+
+ + + + +
+ +

References

+
+American Diabetes Asssociation. (2015). The burden of diabetes in north carolina. http://main.diabetes.org/dorg/PDFs/Advocacy/burden-of-diabetes/north-carolina.pdf +
+
+Barker, L. E., Thompson, T. J., Kirtland, K. A., Boyle, J. P., Geiss, L. S., McCauley, M. M., & Albright, A. L. (2013). Bayesian small area estimates of diabetes incidence by united states county, 2009. Journal of Data Science, 11(1), 269–280. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4537395/ +
+
+Centers for Disease Control and Prevention. (2020). National diabetes statistics report. US Department of Health and Human Services. https://www.cdc.gov/diabetes/pdfs/data/statistics/national-diabetes-statistics-report.pdf +
+
+JNK, R. (2003). Small area estimation. https://onlinelibrary.wiley.com/doi/pdf/10.1002/0471722189.fmatter +
+
+Klein, R. J., & Schoenborn, C. A. (2001). Age adjustment using the 2000 projected u.s. population. Healthy People 2000 Stat Notes, 20, 1–9. +
+
+Pierannunzi, C., Town, M., Garvin, W., Shaw, F. E., & Balluz, L. (2012). Methodologic changes in the behavioral risk factor surveillance system in 2011 and potential effects on prevalence estimates. Morbidity and Mortality Weekly Report, 61(22), 410–413. https://www.cdc.gov/mmwr/pdf/wk/mm6122.pdf +
+

Reuse

Citation

BibTeX citation:
@online{belanger2020,
+  author = {Belanger, Kyle},
+  title = {Diabetes in {Rural} {North} {Carolina} : {Exploring}
+    {Prevalence} {Trends}},
+  date = {2020-06-25},
+  langid = {en}
+}
+
For attribution, please cite this work as:
+Belanger, K. (2020, June 25). Diabetes in Rural North Carolina : +Exploring Prevalence Trends. +
+ +
+ + + + \ No newline at end of file diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/c-g1-1.png b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/c-g1-1.png new file mode 100644 index 0000000..69fcf95 Binary files /dev/null and b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/c-g1-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/c-g4-1.png b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/c-g4-1.png new file mode 100644 index 0000000..05d3f82 Binary files /dev/null and b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/c-g4-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/county-boxplot-1.png b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/county-boxplot-1.png new file mode 100644 index 0000000..0557781 Binary files /dev/null and b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/county-boxplot-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/nc-data-aberration-1.png b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/nc-data-aberration-1.png new file mode 100644 index 0000000..ccf05cf Binary files /dev/null and 
b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/nc-data-aberration-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/nc-g1-1.png b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/nc-g1-1.png new file mode 100644 index 0000000..0a69aaa Binary files /dev/null and b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/nc-g1-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/nc-g2-1.png b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/nc-g2-1.png new file mode 100644 index 0000000..8773072 Binary files /dev/null and b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/nc-g2-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/o-g1-1.png b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/o-g1-1.png new file mode 100644 index 0000000..29c6126 Binary files /dev/null and b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/o-g1-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/pct_p-histogram-1.png 
b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/pct_p-histogram-1.png new file mode 100644 index 0000000..03f7fc9 Binary files /dev/null and b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/pct_p-histogram-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/s-g1-1.png b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/s-g1-1.png new file mode 100644 index 0000000..0e98c32 Binary files /dev/null and b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/s-g1-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/s-g2-1.png b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/s-g2-1.png new file mode 100644 index 0000000..b41c73e Binary files /dev/null and b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/s-g2-1.png differ diff --git a/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/spaghetti-plot-1.png b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/spaghetti-plot-1.png new file mode 100644 index 0000000..3818de1 Binary files /dev/null and b/_site/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends_files/figure-html/spaghetti-plot-1.png differ diff --git 
a/_site/posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html b/_site/posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html index 3049741..09b0555 100644 --- a/_site/posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html +++ b/_site/posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html @@ -166,6 +166,12 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
+
+
Modified
+
+

October 12, 2023

+
+
diff --git a/_site/posts/post-with-code/index.html b/_site/posts/2022-07-28_making-maps-in-R/making-maps-in-r.html similarity index 85% rename from _site/posts/post-with-code/index.html rename to _site/posts/2022-07-28_making-maps-in-R/making-maps-in-r.html index 990dc3e..8d68b59 100644 --- a/_site/posts/post-with-code/index.html +++ b/_site/posts/2022-07-28_making-maps-in-R/making-maps-in-r.html @@ -6,10 +6,10 @@ - - + + -Kyle Belanger - Post With Code +Kyle Belanger - Making Maps in R - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
- -
-
-
-

Welcome To My Blog

-
-
news
-
-
-
- - -
- -
-
Author
-
-

Tristan O’Malley

-
-
- -
-
Published
-
-

September 30, 2023

-
-
- - -
- - -
- - - - -
- - - - -

This is the first post in a Quarto blog. Welcome!

-

-

Since this post doesn’t specify an explicit image, the first image in the post will be used in the listing page of posts.

- - - -
- -
- - - - \ No newline at end of file diff --git a/_site/posts/welcome/thumbnail.jpg b/_site/posts/welcome/thumbnail.jpg deleted file mode 100644 index 8e3107c..0000000 Binary files a/_site/posts/welcome/thumbnail.jpg and /dev/null differ diff --git a/_site/search.json b/_site/search.json index c916d96..aecdb0f 100644 --- a/_site/search.json +++ b/_site/search.json @@ -1,10 +1,66 @@ [ { - "objectID": "posts/welcome/index.html", - "href": "posts/welcome/index.html", - "title": "Welcome To My Blog", + "objectID": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html", + "href": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html", + "title": "Converting From Blogdown to Distill", "section": "", - "text": "This is the first post in a Quarto blog. Welcome!\n\nSince this post doesn’t specify an explicit image, the first image in the post will be used in the listing page of posts." + "text": "I have since converted this blog to a quarto blog, but am leaving this post up in case anyone finds it useful" + }, + { + "objectID": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#code-folding", + "href": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#code-folding", + "title": "Converting From Blogdown to Distill", + "section": "Code Folding", + "text": "Code Folding\nWhen I converted my blog on 12/30/2020, code folding was not included as an option by default in distill. At that time, an excellent package called Codefolder added the functionality. Since going live with the blog, code folding has been added to distill.1 Code folding is available for either the whole document or individual code sections. 
The default caption is “Show Code”, but instead of typing code_folding=TRUE, you can provide a string to change the caption.\n\n# Some awesome code \n# That does awesome things" + }, + { + "objectID": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#customizing-the-home-page", + "href": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#customizing-the-home-page", + "title": "Converting From Blogdown to Distill", + "section": "Customizing the Home Page", + "text": "Customizing the Home Page\nBy default, a distill blog’s home page will be the blog index page. I chose to edit my home page to be a landing page for myself and then have the blog index as a separate page. When creating a new blog, this is the default YAML header for your index page.\n---\ntitle: \"New Site\"\nsite: distill::distill_website\nlisting: posts\n---\nThe critical piece here is the line site: distill::distill_website. This line is what is needed to render the website. For my home page, I decided to use the package Postcard, which is used to generate simple landing pages. I won’t go into every step as there is already a great post by Alison Hill on how to do that. 
However, I will point out the most crucial part of the new index page the YAML header needs to contain these two lines.\noutput:\n postcards::trestles\nsite: distill::distill_website" + }, + { + "objectID": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#footnotes", + "href": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#footnotes", + "title": "Converting From Blogdown to Distill", + "section": "Footnotes", + "text": "Footnotes\n\n\nNote that as of publishing, code folding is only available in the development version of distill↩︎" + }, + { + "objectID": "posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends.html", + "href": "posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends.html", + "title": "Diabetes in Rural North Carolina : Exploring Prevalence Trends", + "section": "", + "text": "Update\n2022-15-03: Since this was posted the CDC has updated how county level diabetes prevalance is calculated. The data presented here is using previous calcualtions and may no longer be correct. More can be read here\n\n\nAbstract\nDiabetes is growing at an epidemic rate in the United States. In North Carolina alone, diabetes and prediabetes cost an estimated $10.9 billion each year (American Diabetes Asssociation, 2015). This post introduces the exploration of the Diabetes epidemic in North Carolina. Through a series of posts this project will examine various public data available on diabetes and explore possible solutions to address the rise of diabetes in North Carolina. This investigation stems from the Capstone project of my Health Care Informatics Masters program. 
This post will answer the following questions:\n\n\nWhat is the overall trend of diabetes prevalence in the United States?\n\n\n\n\nWhat is the trend of diabetes at a State Level and how does diabetes prevalence vary by state and region?\n\n\n\n\nHow do trends in diabetes prevalence vary across counties of North Carolina?\n\n\n\n\nIn which counties of North Carolina does the largest change in diabetes prevalence occur?\n\n\n\n\nHow does change in diabetes prevalence compare between rural and urban counties?\n\n\n\n\nEnviroment\nThis section contains technical information for deeper analysis and reproduction. Casual readers are invited to skip it.\nPackages used in this report.\n\n\nCode\n# Attach these packages so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path\nlibrary(magrittr) # enables piping : %>%\nlibrary(dplyr) # data wrangling\nlibrary(ggplot2) # graphs\nlibrary(tidyr) # data tidying\nlibrary(maps)\nlibrary(mapdata)\nlibrary(sf)\nlibrary(readr)\n\n\nDefinitions of global object (file paths, factor levels, object groups ) used throughout the report.\n\n\nCode\n#set ggplot theme\nggplot2::theme_set(theme_bw())\n\n\n\n\nData\nThe data for this exploration comes from several sources:\n\nThe Diabetes data set for state and county levels were sourced from the US Diabetes Surveillance System; Division of Diabetes Translation - Centers for Disease Control and Prevention. 
The data was downloaded one year per file, and compiled into a single data set for analysis.\nThe Diabetes data set for National level data were sourced from the CDC’s National Health Interview Survey (NHIS)\nThe list of rural counties was taken from The Office of Rural Health Policy, the list is available here\n\n\n\n\nCode\n# load the data, and have all column names in lowercase\n\nnc_diabetes_data_raw <- read_csv(\"https://raw.githubusercontent.com/mmmmtoasty19/nc-diabetes-epidemic-2020/62bdaa6971fbdff09214de7c013d40122abbe40d/data-public/derived/nc-diabetes-data.csv\") %>% \n rename_all(tolower)\n\nus_diabetes_data_raw <- read_csv(\"https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/62bdaa6971fbdff09214de7c013d40122abbe40d/data-public/raw/us_diabetes_totals.csv\"\n ,skip = 2)\n\nrural_counties <- read_csv(\"https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/rural-counties.csv\")\n\ncounty_centers_raw <- read_csv(\"https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/raw/nc_county_centers.csv\", col_names = c(\"county\", \"lat\",\"long\"))\n\ndiabetes_atlas_data_raw <- read_csv(\"https://raw.githubusercontent.com/mmmmtoasty19/nc-diabetes-epidemic-2020/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/raw/DiabetesAtlasData.csv\"\n ,col_types = cols(LowerLimit = col_skip(), \n UpperLimit = col_skip(),\n Percentage = col_double()), skip = 2)\n\n\n\n\n\nCode\n# load in both US State Map and NC County Map\n\nnc_counties_map_raw <- st_as_sf(map(\"county\",region = \"north carolina\", plot = FALSE,fill = TRUE)) %>% \n mutate_at(\"ID\", ~stringr::str_remove(.,\"north carolina,\"))\n\nstate_map_raw <- st_as_sf(map(\"state\",plot = FALSE,fill = TRUE ))\n\nnc_cities <- 
st_as_sf(read_csv(\"https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/nc_cities.csv\"),\n coords = c(\"long\", \"lat\")\n ,remove = FALSE\n ,agr = \"constant\"\n ,crs = 4326)\n\n\n\n\nData Manipulation\nThe combined data used in this anaylsis can be downloaded here. The only tweaks done here are to combine the rural counties column, and the data for creating maps.\n\nTweaks\n\n\n\nCode\ncounty_centers <- county_centers_raw %>% \n mutate_all(~stringr::str_replace_all(.,\n c(\"\\\\°\" = \"\"\n ,\"\\\\+\" = \"\"\n ,\"\\\\–\" = \"-\"\n )\n ) \n ) %>%\n mutate(across(c(\"lat\",\"long\"), ~iconv(.,from = 'UTF-8', to = 'ASCII//TRANSLIT'))\n ,across(c(\"lat\",\"long\"),~stringr::str_remove_all(.,\"\\\\?\"))) %>% \n mutate_at(c(\"lat\",\"long\"),as.numeric) %>%\n mutate(across(\"long\", ~(. * -1))) %>% \n mutate_at(\"county\", tolower)\n\n\n\n\nus_diabetes_data <- us_diabetes_data_raw %>% \n filter(Year >= 2000) %>% \n select( \"Year\",\"Total - Percentage\") %>% \n rename(year = Year , us_pct = `Total - Percentage`)\n\ndiabetes_atlas_data <- diabetes_atlas_data_raw %>% \n mutate_at(\"State\", tolower) %>% \n filter(Year >= 2000)\n\nstate_map_abb <- state_map_raw %>% \n left_join(read_csv(\"https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/state-abb.csv\") %>% \n mutate_at(\"state\", tolower)\n ,by = c(\"ID\" = \"state\") )\n\n\n\n\nMerge\n\n\n\nCode\n#join US totals to NC data \n\nnc_diabetes_data <- nc_diabetes_data_raw %>% \n mutate_at(\"county\", ~stringr::str_replace_all(.,\"Mcdowell\",\"McDowell\")) %>% \n mutate(\n rural = county %in% rural_counties$rural_counties\n ) %>% \n mutate_at(\"county\",tolower) %>% \n left_join(us_diabetes_data)\n\n\nnc_counties_map <- nc_counties_map_raw %>% \n left_join(nc_diabetes_data, by = c(\"ID\" = \"county\")) %>% \n left_join(county_centers, by = c(\"ID\" = \"county\")) %>% \n rename(\n 
center_long = long\n ,center_lat = lat)\n\nstate_map <- state_map_abb %>% \n left_join(diabetes_atlas_data, by = c(\"ID\" = \"State\")) %>% \n rename_all(tolower)\n\n\n\n\n\nOverall - National Level\n\n\nCode\nus_diabetes_data <- us_diabetes_data %>% \n mutate(\n change = lead(us_pct) - us_pct\n ,change = if_else(change > 0, TRUE, FALSE)\n ) %>% \n mutate_at(\"change\", ~stringr::str_replace_na(.,\"NA\"))\n\n\n\no_g1 <- us_diabetes_data %>% \n ggplot(aes(x = year, y = us_pct)) +\n geom_line(color= \"#D95F02\") +\n # geom_line(aes(color = change, group = 1)) +\n geom_point(shape = 21, size = 3,color= \"#D95F02\") +\n # geom_point(aes(color = change),shape = 21, size = 3) +\n scale_color_manual(values = c(\n \"TRUE\" = \"#D95F02\"\n ,\"FALSE\" = \"#7570B3\"\n ), guide = FALSE) +\n labs(\n title = \"Percentage of Diagnosed Diabetes in Adults (18+), National Level\"\n ,x = NULL\n ,y = NULL\n ,caption = \"Note: Data from the CDC's National Health Interview Survey (NHIS)\"\n )\n\no_g1\n\n\n\n\n\nOverall, the national average for diagnosed diabetes sharply rose through the early 2000’s, leveling off around 2010. These numbers however, are estimates based on the self-reported response to the CDC’s National Health Interview Survey, and do not represent the actual confirmed diagnoses. The CDC estimates that 1 in 5 adults have undiagnosed diabetes, therefore the numbers reported by the NHIS are likely to underestimate the true prevalence (Centers for Disease Control and Prevention, 2020).\n\n\nOverall - State Level\nState and County level data on diabetes prevalence are taken from the CDC’s Behavioral Risk Factor Surveillance System (BRFSS). These results are based on the question “Has a doctor, nurse, or other health professional ever told you that you have diabetes?”. Women who only experienced diabetes during pregnancy were excluded from the counts. The BRFSS is an ongoing, monthly telephone survey of the non-institutionalized adults (aged 18 years or older) in each state. 
The year 2011 saw a major change to the methodology of the survey, which started to include homes without a landline phone. This change was expected to increase coverage of lower income, lower educational levels, and younger age groups, because these groups often exclusively rely on cellular telephones for personal communication.(Pierannunzi et al., 2012)\n\n\nCode\ns_g1 <- state_map %>% \n st_drop_geometry() %>% \n ggplot(aes(x = year, y = percentage, color = region)) +\n geom_line(aes(group = id ),alpha = 0.3,na.rm = TRUE) +\n geom_smooth(method = \"lm\", se = FALSE) +\n ggpmisc::stat_poly_eq(formula = y ~ + x ,\n aes(label = paste(..eq.label.., ..rr.label.., sep = \"~~~\")), \n parse = TRUE) +\n geom_vline(xintercept = 2011, linetype = \"dashed\", color = \"gray\") +\n scale_color_brewer(palette = \"Dark2\"\n ,direction = -1\n ,labels = snakecase::to_title_case\n ) +\n labs(\n title = \"Percentage of Diagnosed Diabetes in Adults (18+) \\nby State and Region\"\n ,x = NULL\n ,y = NULL\n ,color = \"Region\"\n ,caption = \"Regions from US Census Bureau\"\n ) \n\ns_g1\n\n\n\n\n\nThe above graph shows diabetes prevalence trends by state, grouped into regions based on the US Census classification regions. 
While all regions of the United states show positive growth in diabetes prevalence, the south exhibits a slightly higher growth rate, as well as the highest prevalence.\n\n\nCode\ns_g2 <- state_map %>% \n st_drop_geometry() %>% \n filter(region == \"south\") %>% \n mutate_at(\"id\", ~snakecase::to_title_case(.)) %>% \n ggplot(aes(x = year, y = percentage)) +\n geom_line(aes(group = id ),na.rm = TRUE, color= \"#D95F02\") +\n gghighlight::gghighlight(id == \"North Carolina\", label_params = list(vjust = 3)) +\n scale_y_continuous(breaks = seq(5,13,2)) +\n scale_x_continuous(minor_breaks = seq(2000,2016,1)) +\n labs(\n title = \"Percentage of Diagnosed Diabetes in Adults (18+) \\nSouth Region\"\n ,x = NULL\n ,y = NULL\n ,caption = \"Regions from US Census Bureau\"\n ) +\n theme()\n\ns_g2\n\n\n\n\n\nWhen focusing on the south region, North Carolina falls close to the middle of diabetes prevalence.\n\n\nOverall - North Carolina\nWhen examining the trajectory for North Carolina, we can see that it has been consistently higher than national average . We see that in 2016 there was a large spike in diagnosed cases; unfortunately this is the last available year so it is unclear whether the upward trend continues. The graph below compares state-level average to the national average. Notice that the trend line is slightly higher than in the previous graphs: this is due to the age cut offs used for National and State level data vs County Level data. Previous data used 18 years of age as a cutoff for classifying adults, whereas the county level data uses 20. 
Due to removing 18- and 19-year-olds from the population, who typically have less diagnosed cases of diabetes than those of older ages, the computed prevalence increases slightly.\n\n\nCode\nd1 <- nc_diabetes_data %>% \n group_by(year) %>% \n summarise(\n pct = mean(percentage)\n ,us_pct = mean(us_pct)\n ) %>% \n pivot_longer(\n cols = c(\"pct\", \"us_pct\")\n ,names_to = \"metric\"\n ,values_to = \"values\"\n ) %>% \n mutate(\n metric = factor(metric\n ,levels = c(\"pct\",\"us_pct\")\n ,labels = c(\"NC\", \"National\"))\n )\n\nnc_g1 <- d1 %>% \n ggplot(aes(x = year, y = values, color = metric)) +\n geom_line() +\n geom_point(shape = 21, size = 3) +\n geom_vline(xintercept = 2011, linetype = \"dashed\", color = \"gray\") +\n scale_y_continuous(labels = function(x) paste0(x, \"%\")) +\n scale_color_brewer(palette = \"Dark2\") +\n labs(\n x = NULL\n ,y = NULL\n ,color = NULL\n ,title = \"Percent of Adults (20+) with Diagnosed Diabetes\"\n )\n\nnc_g1 \n\n\n\n\n\nWe see a spike in 2016, the last year for which the data are available. However, we should be careful with our interpretation of this pattern, because the examination of the county-level trajectories reveals an aberration in the trend that requires a more rigorous investigation.\n\n\nCode\nnc_g1a <- nc_diabetes_data %>% \n ggplot(aes(x = year, y = percentage)) +\n geom_line(aes(group = county),alpha = 0.4) +\n labs(\n x = NULL\n ,y = NULL\n ,color = NULL\n )\n\nnc_g1a\n\n\n\n\n\nWhile all of North Carolina has a higher prevalence than the national average, rural counties have systematically higher prevalence of diabetes than urban counties. Note that after 2011 both Urban and Rural counties break the upward trend exhibited in the previous 5 years. This could be explained by the addition of cell phones to the BRFS Survey as many rural areas are often lower income areas and may only rely on a cell phone for communication. 
As mentioned previously there is an odd spike in case in 2016 that can’t be explained by current documentation. For the purpose of this evaluation 2016 will be excluded from the county level data since the odd trend can not be explained and no further data is available to determine if this is a real spike or could be attributed to methodology change or data quality.\n\n\nCode\nd2 <- nc_diabetes_data %>% \n select(-us_pct) %>% \n mutate(\n pct_rural = if_else(rural == TRUE, percentage, FALSE)\n ,pct_urban = if_else(rural == FALSE, percentage, FALSE)\n ) %>% \n select(-countyfips,-percentage) %>% \n group_by(year) %>% \n summarise(\n pct_rural = mean(pct_rural,na.rm = TRUE)\n ,pct_urban = mean(pct_urban,na.rm = TRUE)\n ) %>% left_join(us_diabetes_data) %>% \n pivot_longer(\n cols = c(\"us_pct\", \"pct_rural\",\"pct_urban\")\n ,names_to = \"metric\"\n ,values_to = \"value\"\n ,values_drop_na = TRUE\n ) %>% \n mutate(\n metric = factor(metric,\n levels = c(\"pct_rural\",\"pct_urban\",\"us_pct\")\n ,labels = c(\"Rural\",\"Urban\",\"US\")\n )\n )\n\nnc_g2 <- d2 %>% ggplot(aes(x = year, y = value, color = metric)) +\n geom_line() +\n geom_point(shape = 21, size = 3) +\n geom_vline(xintercept = 2011, linetype = \"dashed\", color = \"gray\") +\n scale_y_continuous(labels = function(x) paste0(x, \"%\")) +\n scale_color_brewer(palette = \"Dark2\") +\n labs(\n x = NULL\n ,y = NULL\n ,color = NULL\n ,title = \"Percent of Adults (20+) with Diagnosed Diabetes \\nDisplaying Rural vs Urban\"\n )\n\nnc_g2\n\n\n\n\n\n\n\nBy County - Geographical\nCounty level data first became available in 2004, three years of data is used to arrive at these estimates. For example, the 2006 estimates were computed using the data from 2005, 2006, and 2007 BRFS survey rounds. The county-level estimates were based on indirect model-dependent estimates using Bayesian multilevel modeling techniques(JNK, 2003 ; Barker et al., 2013). 
This model-dependent approach employs a statistical model that “borrows strength” in making an estimate for one county from BRFSS data collected in other counties and states. Multilevel Binomial regression models with random effects of demographic variables (age 20-44, 45-64, >=65; race/ethnicity; sex) at the county-level were developed. Estimates were adjusted for age to the 2000 US standard population using age groups of 20-44, 45-64, and 65 or older(Klein & Schoenborn, 2001).\n\n\nCode\ng50 <- nc_diabetes_data %>% \n filter(year < 2015) %>% \n mutate(\n rural = factor(rural\n ,levels = c(TRUE,FALSE)\n ,labels = c(\"Rural\", \"Urban\")\n )\n ) %>% \n ggplot(aes(x = year, y = percentage, color = rural)) +\n geom_line(aes(group = county),alpha = 0.3) +\n geom_smooth(aes(group = rural), method = \"loess\", se= FALSE, size = 1.1) +\n scale_color_brewer(palette = \"Dark2\") +\n labs(\n title = \"Percent of Adults (20+) with Diagnosed Diabetes \\nAll North Carolina Counties\"\n ,x = NULL\n ,y = NULL\n ,color = NULL\n )\n\ng50\n\n\n\n\n\nWhen viewing all county trend lines together, we see that the loess line for both urban and rural follows a similar trend for the time period.\nThe following graphs displays the total estimated prevalence of Diabetes in each off the 100 North Carolina counties. To keep the scaling consistent between the graphs, we binned the estimates into 6 intervals of the same size. Rural counties are highlighted with a stronger border line as well as a letter “R” in respective geographic centers. 
These graphs allow us to view geographical clusters of diabetes prevalence.\n\n\nCode\nnc_counties_map_binned <- nc_counties_map %>% \n filter(year < 2015) %>% \n mutate(\n bin = dlookr::binning(.$percentage, nbins = 6 ,type = \"equal\")\n ,bin = forcats::fct_recode(bin\n ,\"6.5 - 7.9\" = \"[6.5,7.97]\"\n ,\"8.0 - 9.4\" = \"(7.97,9.43]\" \n ,\"9.5 - 10.9\" = \"(9.43,10.9]\" \n ,\"11.0 - 12.4\" = \"(10.9,12.4]\"\n ,\"12.5 - 13.8\" = \"(12.4,13.8]\" \n ,\"13.9 - 15.3\" = \"(13.8,15.3]\"\n )\n )\n\nc_g1 <- nc_counties_map_binned %>% \n filter(year %in% c(2006,2014)) %>% \n ggplot() +\n geom_sf() + #blank geom_sf keeps gridlines from overlapping map\n geom_sf(aes(fill = bin,color = rural)) +\n geom_sf(data = nc_cities) +\n ggrepel::geom_text_repel(data = nc_cities, \n aes(x = long, y = lat, label = city)\n ,nudge_y = c(-1,1,1,-1,1)\n ,nudge_x = c(0,0,0,-1,0)\n ) +\n geom_text(data = . %>% filter(rural == TRUE)\n ,aes(x = center_long, y = center_lat)\n ,label = \"R\"\n ,color = \"#696969\"\n ) +\n coord_sf(xlim = c(-84.5,-75.5), ylim = c(33.75,37)) +\n facet_wrap(~year) +\n scale_fill_viridis_d(alpha = 0.6, direction = -1) +\n scale_color_manual(\n values = c(\n \"FALSE\" = \"gray\"\n ,\"TRUE\" = \"black\"\n ),guide = 'none') +\n labs(\n title = \"Estimated Diabetes in Adults (20+) by County\"\n ,fill = \"Percentage\"\n ,y = NULL\n ,x = NULL\n ) +\n theme(\n panel.background = element_rect(fill = \"aliceblue\")\n ,panel.grid.major = element_line(color = \"#D4D4D4\", linetype = \"dashed\", \n size = 0.5)\n ,legend.position = \"bottom\"\n ,plot.title = element_text(hjust = 0.5)\n )\n\nc_g1\n\n\n\n\n\nThe following box plot displays the distribution of estimated cases by county from 2006 to 2014. 
For all years of current data the mean of rural counties is higher then that of their Urban counterparts.\n\n\nCode\nc_g1c <- nc_counties_map %>% \n mutate(\n rural = factor(rural\n ,levels = c(TRUE,FALSE)\n ,labels = c(\"Rural\", \"Urban\")\n )) %>% \n filter(year < 2015) %>%\n ggplot(aes(x = year, y = percentage, group = interaction(year,rural), fill = rural)) +\n geom_boxplot(alpha = 0.5) +\n scale_fill_brewer(palette = \"Dark2\") +\n scale_x_continuous(breaks = seq(2004,2014,2)) +\n labs(\n x = NULL\n ,y = NULL\n ,fill = NULL\n ,title = \"Distribution of Estimated Cases by County 2006 - 2014\"\n )\n\nc_g1c\n\n\n\n\n\n\n\nBy County - Percent Change\nThe following graphs display the overall change in estimated prevalence between 2006 to 2014.\n\n\nCode\nd3 <- nc_counties_map %>% \n st_drop_geometry() %>% \n filter(year %in% c(2006,2014)) %>% \n select(-countyfips,-us_pct) %>% \n pivot_wider(names_from = \"year\"\n ,values_from = \"percentage\") %>% \n mutate(\n pct_p = `2014` - `2006`\n ,pct_c = ((`2014` - `2006`)/`2006`) * 100\n ) %>% \n left_join(nc_counties_map_raw) %>% \n st_as_sf()\n\n\nc_g4 <- d3 %>% \n ggplot() +\n geom_sf() + #blank geom_sf keeps gridlines from overlapping map\n geom_sf(aes(fill = pct_c ,color = rural)) +\n geom_sf(data = nc_cities) +\n ggrepel::geom_text_repel(data = nc_cities, \n aes(x = long, y = lat, label = city)\n ,nudge_y = c(-1,1,1,-1,1)\n ,nudge_x = c(0,0,0,-1,0)\n ) +\n geom_text(data = . 
%>% filter(rural == TRUE)\n ,aes(x = center_long, y = center_lat)\n ,label = \"R\"\n ,color = \"#696969\"\n ) +\n # scale_fill_viridis_c(alpha = 0.6, direction = -1) +\n scale_fill_gradient2(\n low = \"#d01c8b\"\n ,mid = \"#f7f7f7\"\n ,high = \"#4dac26\"\n ,midpoint = 0\n ) +\n scale_color_manual(\n values = c(\n \"FALSE\" = \"gray\"\n ,\"TRUE\" = \"black\"\n ),guide = 'none') +\n labs(\n title = \"Percentage Change of Diagnosed Diabetes 2006-2014\"\n ,fill = \"Percentage\"\n ,y = NULL\n ,x = NULL\n ) +\n theme(\n panel.background = element_rect(fill = \"aliceblue\")\n ,panel.grid.major = element_line(color = \"#D4D4D4\", linetype = \"dashed\", \n size = 0.5)\n )\n\nc_g4\n\n\n\n\n\nThe following chart displays the density curve of the percentage change for both rural and urban counties. It is notable that the mean of change for Urban counties is actually higher than the mean for rural counties. However, we also see that most change for both regions is positive growth. In fact only 16 rural, and 10 Urban counties experienced negative change in the given time frame. 
While 35 rural and 34 urban counties experience growth in the same period.\n\n\nCode\nd4 <- d3 %>% \n st_drop_geometry() %>% \n mutate(\n rural = factor(rural\n ,levels = c(TRUE,FALSE)\n ,labels = c(\"Rural\", \"Urban\")\n )\n )\n\n\nmean_d4 <- d4 %>% \n group_by(rural) %>% \n summarise(.groups = \"keep\"\n ,pct_c = mean(pct_c)\n )\n\ng51 <- d4 %>% \n ggplot(aes(x = pct_c, fill = rural, y = ..density.., color = rural)) +\n geom_histogram(binwidth = 5, position = \"identity\", alpha = 0.3) +\n geom_density(alpha = 0.5) +\n facet_wrap(~rural, ncol = 1) +\n geom_vline(aes(xintercept = pct_c), data = mean_d4) +\n geom_text(aes(x = pct_c, y = 0.038, label = round(pct_c, 2))\n ,data = mean_d4\n ,hjust = -0.15\n ,size = 5\n ,color = \"#000000\") +\n geom_vline(xintercept = 0, linetype = \"dashed\", color = \"#696969\") +\n scale_color_brewer(palette = \"Dark2\", guide = NULL) +\n scale_fill_brewer(palette = \"Dark2\", guide = NULL) +\n labs(\n x = \"Percentage Change\"\n ,y = \"Density\"\n ,fill = NULL\n )\ng51\n\n\n\n\n\n\n\nConclusion and Next Steps\nThe original hypothesis of this report was that rural counties were growing at a higher rate then there urban counterparts. Through out this post it has been shown that this hypothesis is incorrect, just being a rural county does not indicate diabetes growth, in fact the growth rate throughout North Carolina has been consistent. Further posts will explore other reasons for these trends, as the current post merely explores the trends and differences using data visualizations, a more rigorous and formal evaluation of these comparison is in order.\n\n\nSession information\n===========================================================================\nFor the sake of documentation and reproducibility, the current report was rendered in the following environment. 
Click the line below to expand.\n\n\nEnvironment \n\n\n\nCode\nif( requireNamespace(\"devtools\", quietly = TRUE) ) {\n devtools::session_info()\n} else {\n sessionInfo()\n} \n\n\n─ Session info ───────────────────────────────────────────────────────────────\n setting value\n version R version 4.2.1 (2022-06-23 ucrt)\n os Windows 10 x64 (build 22621)\n system x86_64, mingw32\n ui RTerm\n language (EN)\n collate English_United States.utf8\n ctype English_United States.utf8\n tz America/New_York\n date 2023-10-12\n pandoc 3.1.8 @ C:/PROGRA~1/Pandoc/ (via rmarkdown)\n\n─ Packages ───────────────────────────────────────────────────────────────────\n package * version date (UTC) lib source\n bit 4.0.4 2020-08-04 [1] CRAN (R 4.2.2)\n bit64 4.0.5 2020-08-30 [1] CRAN (R 4.2.2)\n cachem 1.0.6 2021-08-19 [1] CRAN (R 4.2.2)\n callr 3.7.3 2022-11-02 [1] CRAN (R 4.2.2)\n class 7.3-20 2022-01-16 [2] CRAN (R 4.2.1)\n classInt 0.4-10 2023-09-05 [1] CRAN (R 4.2.3)\n cli 3.4.1 2022-09-23 [1] CRAN (R 4.2.2)\n colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.2.2)\n confintr 1.0.2 2023-06-04 [1] CRAN (R 4.2.3)\n crayon 1.5.2 2022-09-29 [1] CRAN (R 4.2.2)\n crul 1.4.0 2023-05-17 [1] CRAN (R 4.2.3)\n curl 4.3.3 2022-10-06 [1] CRAN (R 4.2.2)\n DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.2)\n devtools 2.4.5 2022-10-11 [1] CRAN (R 4.2.2)\n digest 0.6.30 2022-10-18 [1] CRAN (R 4.2.2)\n dlookr 0.6.2 2023-07-01 [1] CRAN (R 4.2.3)\n dplyr * 1.1.3 2023-09-03 [1] CRAN (R 4.2.3)\n e1071 1.7-13 2023-02-01 [1] CRAN (R 4.2.3)\n ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.2)\n evaluate 0.21 2023-05-05 [1] CRAN (R 4.2.3)\n extrafont 0.19 2023-01-18 [1] CRAN (R 4.2.2)\n extrafontdb 1.0 2012-06-11 [1] CRAN (R 4.2.0)\n fansi 1.0.3 2022-03-24 [1] CRAN (R 4.2.2)\n farver 2.1.1 2022-07-06 [1] CRAN (R 4.2.2)\n fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.2)\n fontBitstreamVera 0.1.1 2017-02-01 [1] CRAN (R 4.2.0)\n fontLiberation 0.1.0 2016-10-15 [1] CRAN (R 4.2.0)\n fontquiver 0.2.1 2017-02-01 [1] CRAN (R 4.2.3)\n forcats 1.0.0 
2023-01-29 [1] CRAN (R 4.2.3)\n Formula 1.2-5 2023-02-24 [1] CRAN (R 4.2.2)\n fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.2)\n gdtools 0.3.3 2023-03-27 [1] CRAN (R 4.2.3)\n generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.2)\n gfonts 0.2.0 2023-01-08 [1] CRAN (R 4.2.3)\n gghighlight 0.4.0 2022-10-16 [1] CRAN (R 4.2.3)\n ggplot2 * 3.4.2 2023-04-03 [1] CRAN (R 4.2.3)\n ggpmisc 0.5.4-1 2023-08-13 [1] CRAN (R 4.2.3)\n ggpp 0.5.4 2023-08-12 [1] CRAN (R 4.2.3)\n ggrepel 0.9.3 2023-02-03 [1] CRAN (R 4.2.3)\n glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.2)\n gridExtra 2.3 2017-09-09 [1] CRAN (R 4.2.2)\n gtable 0.3.3 2023-03-21 [1] CRAN (R 4.2.3)\n hms 1.1.3 2023-03-21 [1] CRAN (R 4.2.3)\n hrbrthemes 0.8.0 2020-03-06 [1] CRAN (R 4.2.3)\n htmltools 0.5.4 2022-12-07 [1] CRAN (R 4.2.2)\n htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.2.3)\n httpcode 0.3.0 2020-04-10 [1] CRAN (R 4.2.3)\n httpuv 1.6.8 2023-01-12 [1] CRAN (R 4.2.2)\n httr 1.4.6 2023-05-08 [1] CRAN (R 4.2.3)\n inum 1.0-5 2023-03-09 [1] CRAN (R 4.2.3)\n jsonlite 1.8.3 2022-10-21 [1] CRAN (R 4.2.2)\n kableExtra 1.3.4 2021-02-20 [1] CRAN (R 4.2.2)\n KernSmooth 2.23-20 2021-05-03 [2] CRAN (R 4.2.1)\n knitr 1.43 2023-05-25 [1] CRAN (R 4.2.3)\n labeling 0.4.2 2020-10-20 [1] CRAN (R 4.2.0)\n later 1.3.0 2021-08-18 [1] CRAN (R 4.2.2)\n lattice 0.20-45 2021-09-22 [2] CRAN (R 4.2.1)\n libcoin 1.0-10 2023-09-27 [1] CRAN (R 4.2.3)\n lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.2)\n magrittr * 2.0.3 2022-03-30 [1] CRAN (R 4.2.2)\n mapdata * 2.3.1 2022-11-01 [1] CRAN (R 4.2.3)\n maps * 3.4.1 2022-10-30 [1] CRAN (R 4.2.3)\n MASS 7.3-60 2023-05-04 [1] CRAN (R 4.2.3)\n Matrix 1.5-4.1 2023-05-18 [1] CRAN (R 4.2.3)\n MatrixModels 0.5-1 2022-09-11 [1] CRAN (R 4.2.3)\n memoise 2.0.1 2021-11-26 [1] CRAN (R 4.2.2)\n mgcv 1.8-40 2022-03-29 [2] CRAN (R 4.2.1)\n mime 0.12 2021-09-28 [1] CRAN (R 4.2.0)\n miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.2.2)\n munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.2)\n mvtnorm 1.2-2 2023-06-08 [1] CRAN (R 4.2.3)\n nlme 3.1-157 2022-03-25 [2] 
CRAN (R 4.2.1)\n pagedown 0.20 2022-12-13 [1] CRAN (R 4.2.3)\n partykit 1.2-20 2023-04-14 [1] CRAN (R 4.2.3)\n pillar 1.9.0 2023-03-22 [1] CRAN (R 4.2.3)\n pkgbuild 1.4.2 2023-06-26 [1] CRAN (R 4.2.1)\n pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.2)\n pkgload 1.3.2 2022-11-16 [1] CRAN (R 4.2.2)\n polynom 1.4-1 2022-04-11 [1] CRAN (R 4.2.3)\n prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.2.2)\n processx 3.8.1 2023-04-18 [1] CRAN (R 4.2.3)\n profvis 0.3.8 2023-05-02 [1] CRAN (R 4.2.3)\n promises 1.2.0.1 2021-02-11 [1] CRAN (R 4.2.2)\n proxy 0.4-27 2022-06-09 [1] CRAN (R 4.2.3)\n ps 1.7.5 2023-04-18 [1] CRAN (R 4.2.3)\n purrr 1.0.1 2023-01-10 [1] CRAN (R 4.2.3)\n quantreg 5.95 2023-04-08 [1] CRAN (R 4.2.3)\n R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.2)\n RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.2.0)\n Rcpp 1.0.9 2022-07-08 [1] CRAN (R 4.2.2)\n reactable 0.4.4 2023-03-12 [1] CRAN (R 4.2.3)\n readr * 2.1.3 2022-10-01 [1] CRAN (R 4.2.2)\n remotes 2.4.2 2021-11-30 [1] CRAN (R 4.2.2)\n rlang 1.1.0 2023-03-14 [1] CRAN (R 4.2.3)\n rmarkdown 2.22 2023-06-01 [1] CRAN (R 4.2.3)\n rpart 4.1.16 2022-01-24 [2] CRAN (R 4.2.1)\n rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.2.2)\n Rttf2pt1 1.3.12 2023-01-22 [1] CRAN (R 4.2.2)\n rvest 1.0.3 2022-08-19 [1] CRAN (R 4.2.3)\n scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.2)\n sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.2)\n sf * 1.0-14 2023-07-11 [1] CRAN (R 4.2.3)\n shiny 1.7.4 2022-12-15 [1] CRAN (R 4.2.1)\n showtext 0.9-6 2023-05-03 [1] CRAN (R 4.2.3)\n showtextdb 3.0 2020-06-04 [1] CRAN (R 4.2.3)\n snakecase 0.11.0 2019-05-25 [1] CRAN (R 4.2.2)\n SparseM 1.81 2021-02-18 [1] CRAN (R 4.2.0)\n stringi 1.7.8 2022-07-11 [1] CRAN (R 4.2.1)\n stringr 1.5.0 2022-12-02 [1] CRAN (R 4.2.3)\n survival 3.3-1 2022-03-03 [2] CRAN (R 4.2.1)\n svglite 2.1.1 2023-01-10 [1] CRAN (R 4.2.2)\n sysfonts 0.8.8 2022-03-13 [1] CRAN (R 4.2.3)\n systemfonts 1.0.4 2022-02-11 [1] CRAN (R 4.2.2)\n tibble 3.2.1 2023-03-20 [1] CRAN (R 4.2.3)\n tidyr * 1.3.0 2023-01-24 [1] CRAN (R 
4.2.3)\n tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.2)\n tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.2.2)\n units 0.8-4 2023-09-13 [1] CRAN (R 4.2.3)\n urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.2.2)\n usethis 2.2.1 2023-06-23 [1] CRAN (R 4.2.3)\n utf8 1.2.2 2021-07-24 [1] CRAN (R 4.2.2)\n vctrs 0.6.2 2023-04-19 [1] CRAN (R 4.2.3)\n viridisLite 0.4.2 2023-05-02 [1] CRAN (R 4.2.3)\n vroom 1.6.0 2022-09-30 [1] CRAN (R 4.2.2)\n webshot 0.5.4 2022-09-26 [1] CRAN (R 4.2.2)\n withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.2)\n xfun 0.39 2023-04-20 [1] CRAN (R 4.2.3)\n xml2 1.3.5 2023-07-06 [1] CRAN (R 4.2.3)\n xtable 1.8-4 2019-04-21 [1] CRAN (R 4.2.2)\n yaml 2.3.6 2022-10-18 [1] CRAN (R 4.2.1)\n\n [1] C:/Users/belangew/AppData/Local/R/win-library/4.2\n [2] C:/Program Files/R/R-4.2.1/library\n\n──────────────────────────────────────────────────────────────────────────────\n\n\n\n\n\n\n\n\n\n\n\nReferences\n\nAmerican Diabetes Asssociation. (2015). The burden of diabetes in north carolina. http://main.diabetes.org/dorg/PDFs/Advocacy/burden-of-diabetes/north-carolina.pdf\n\n\nBarker, L. E., Thompson, T. J., Kirtland, K. A., Boyle, J. P., Geiss, L. S., McCauley, M. M., & Albright, A. L. (2013). Bayesian small area estimates of diabetes incidence by united states county, 2009. Journal of Data Science, 11(1), 269–280. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4537395/\n\n\nCenters for Disease Control and Prevention. (2020). National diabetes statistics report. US Department of Health and Human Services. https://www.cdc.gov/diabetes/pdfs/data/statistics/national-diabetes-statistics-report.pdf\n\n\nJNK, R. (2003). Small area estimation. https://onlinelibrary.wiley.com/doi/pdf/10.1002/0471722189.fmatter\n\n\nKlein, R. J., & Schoenborn, C. A. (2001). Age adjustment using the 2000 projected u.s. population. Healthy People 2000 Stat Notes, 20, 1–9.\n\n\nPierannunzi, C., Town, M., Garvin, W., Shaw, F. E., & Balluz, L. (2012). 
Methodologic changes in the behavioral risk factor surveillance system in 2011 and potential effects on prevalence estimates. Morbidity and Mortality Weekly Report, 61(22), 410–413. https://www.cdc.gov/mmwr/pdf/wk/mm6122.pdf\n\nReusehttps://creativecommons.org/licenses/by/4.0/CitationBibTeX citation:@online{belanger2020,\n author = {Belanger, Kyle},\n title = {Diabetes in {Rural} {North} {Carolina} : {Exploring}\n {Prevalence} {Trends}},\n date = {2020-06-25},\n langid = {en}\n}\nFor attribution, please cite this work as:\nBelanger, K. (2020, June 25). Diabetes in Rural North Carolina :\nExploring Prevalence Trends." + }, + { + "objectID": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html", + "href": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html", + "title": "Basic Exploration of WHO Tuberculosis Data", + "section": "", + "text": "Today I am going to dive into some real life data from the World Health Organization (WHO), exploring new and relapse cases of Tuberculosis. I clean up the data, and then make a few graphs to explore different variables." + }, + { + "objectID": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html#a-different-way-to-look", + "href": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html#a-different-way-to-look", + "title": "Basic Exploration of WHO Tuberculosis Data", + "section": "A different way to look", + "text": "A different way to look\nCould there be any correlation between a countries population and the amount of TB cases? Maybe its just as simple as having more people means more people to get sick? 
Lets bring in another data set, again from World Bank Found Here, this contains total population data by country.\n\npop_raw <- read.csv(\"API_SP.POP.TOTL_DS2_en_csv_v2_713131.csv\"\n ,skip = 4)\n#If this looks famialer its because it is, the data set looks very simalar to the GDP data\n#In the future this could be moved to a function to allow cleaning much easier\npop1 <- pop_raw %>% \n select(-(Indicator.Name:X2012)\n ,-X2019\n ,-X) %>% \n pivot_longer(cols = X2013:X2018\n ,names_to = \"year\" \n ,values_to = \"population\") %>% \n mutate_if(is.character\n ,str_remove_all\n ,pattern = \"X(?=\\\\d*)\")\n\n#now lets combine this into are overall data set\n\nwho_combined <- who_combined %>% \n mutate(year = as.character(year)) %>% \n left_join(y = pop1) %>% \n select(-Country.Name)\n\n#now lets Graph again\n\ng3 <- who_combined %>% \n filter(str_detect(age,\"014|15plus|u\"),year == 2018) %>% \n group_by(country) %>% \n summarise(sum_tb_cases = (sum(values,na.rm = TRUE)/10000)\n ,population = first(population)/1000000\n ,who_region = first(g_whoregion)) %>% \n mutate(\n label = ifelse((population>250), yes = as.character(country),no = \"\")) %>%\n ggplot(aes(x = population, y = sum_tb_cases )) +\n geom_point(aes(color = who_region)) +\n ggrepel::geom_text_repel(aes(x = population, y = sum_tb_cases, label = label)) +\n labs(\n title = \"Total TB Cases by Country compared to Gross Domestic Product (GDP)\"\n ,x = \"Population (in Millions)\"\n ,y = \"Total TB Case (per 10,000 cases)\"\n ,color = \"WHO Region\"\n ) +\n theme_bw() \n\n g3 \n\n\n\n\n\nFurther Exploration\nMaybe we are on to something, the more people, the more likely they are to get sick! 
However India seems to have a very large number of cases so lets break these cases down further by age group for 2018.\n\ng4 <- who_combined %>% \n filter(year == 2018\n ,country == \"India\"\n ,!(str_detect(age,\"15plus|ageunk|u|014\"))\n ,(str_detect(sex,\"m|f\"))\n ) %>% \n mutate(age_range = glue::glue(\"{age_start} -- {age_end}\")) %>% \n ggplot(aes(x = reorder(age_range, as.numeric(age_start)), y = (values/1000), fill = sex)) +\n geom_col(position = \"dodge\") +\n labs(\n title = \"TB Case in India by age and gender 2018\"\n ,x = NULL\n ,y = \"Total Cases (per 1000)\"\n ,fill = \"Gender\") +\n scale_fill_manual(labels = c(\"Female\",\"Male\"), values = c(\"#e9a3c9\",\"#67a9cf\") )\n \ng4\n\n\n\n\nThere seems to be a huge spike in cases after adolescences. Females have a sharp decline the older they get, where as male case stay elevated with a slight decrease at 55." + }, + { + "objectID": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html#last-exploration", + "href": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html#last-exploration", + "title": "Basic Exploration of WHO Tuberculosis Data", + "section": "Last Exploration", + "text": "Last Exploration\nLets look at overall cases in India, going back to 1980 and see if there as been any trends. 
To get these numbers we will go back to our raw data and strip everything out expect the total count\n\ng5 <- who_raw %>% \n filter(country == \"India\") %>% \n select(year, c_newinc) %>% \n ggplot(aes(x = year, y = c_newinc/1000000)) +\n geom_line() +\n geom_point() +\n labs(\n title = \"New and Relapse Tuberculosis Cases In India \\n1980 -- 2018\"\n ,x = NULL\n ,y = \"Total Cases (in millions)\") +\n theme_bw() +\n theme(plot.title = element_text(hjust = 0.5)) + #center title \n scale_x_continuous(breaks = seq(1980,2020,5)) +\n scale_y_continuous(breaks = scales::pretty_breaks(n=10)) #different way to add tick marks\ng5\n\n\n\n\nCases were steadily rising from 1980 to 1990, then suddenly feel off. Starting in the early 2010s there was a sharp increase and the amount of new and relapse cases just keep growing." + }, + { + "objectID": "posts/2020-01-29_facets-and-humility/facets-and-a-lesson-in-humility.html", + "href": "posts/2020-01-29_facets-and-humility/facets-and-a-lesson-in-humility.html", + "title": "Facets and a Lesson in Humility", + "section": "", + "text": "Todays post is a lesson in Facets, as well as humility. The task this week was to replicate the graph in Chapter 8 of Tableau for Healthcare in R. The graph in question is called a Table Lens (This is the name the book uses, however I did have trouble finding this name in Google searches), it is a collection of charts with a common theme, this time looking at countries in various WHO regions and some statistics associated with mortality as well as health expenditure. I say this is a lesson in humiltiy as I have read through the excellent book R for Data Science, and yet the idea of faceting a ggplot graph slipped my mind. This ended with hours of trying to find a package in R to line up graphs, and way more time then I care to admit spent on getting things prefect. I did find such a package called cowplots, which can be found here. 
While this is an excellent package, its use was unecessary and I reverted back to using the excellent facet feature of GGplot, which can be seen below! \n\nLoad Libraries\n\nlibrary(magrittr) #pipes\nlibrary(ggplot2) #ploting \nlibrary(dplyr)\nlibrary(tidyr)\n\n\n\nImport Data\n\nds <- readxl::read_xlsx(path = \"../2020-01-04_my-start-to-r/Tableau 10 Training Practice Data.xlsx\"\n ,sheet = \"03 - WHO Life Expect & Mort\"\n )\n\n\n\nClean Names and Transform\n\nvarnames <- c(\"who_region\", \"country\", \"year\" , \"sex\" , \"life_expect_birth\" , \"neo_mort\"\n ,\"under_five_mort\" , \"health_expenditure\")\nnames(ds) <- varnames\n\n# Order Countries based on Life Expectancy at Birth\n\nds$country <- factor(ds$country, levels = ds$country[order(ds$life_expect_birth)]) \n\n#To \"Long\" Form\n\nds1 <- ds %>% pivot_longer(5:8)#select columns 5 throuh 8, leave new columns at default names\n\n# Set up labels for Facet, as well as function for Facet Labeller\n\nfacet_labels <- list(\n\"life_expect_birth\" = \"Life Expectancy at Birth \" \n,\"neo_mort\" = \"Neonatal Mortality Rate\" \n,\"under_five_mort\" = \"Under-Five Mortality Rate\"\n,\"health_expenditure\" = \"Health Expenditure per Capita (US$)\" )\n\nvariable_labeller <- function(variable,value){\n return(facet_labels[value])\n}\n\n\n\nGraphs\n\nhightlight_countries <- (c(\"Mauritania\", \"South Africa\")) \n\ng1 <- ds1 %>% filter(who_region == \"Africa\") %>% \n mutate(name = factor(name, levels = c(\"life_expect_birth\" , \"neo_mort\"\n ,\"under_five_mort\" , \"health_expenditure\"))\n ,highlight = country %in% hightlight_countries) %>% \n ggplot(aes(x = country, y = value, fill = highlight)) +\n geom_col(show.legend = FALSE) +\n coord_flip() +\n labs(\n title = \"World Bank Life Expectancy, Neonatal & Under-Five Mortality Rates, and Health Expenditure Analysis\"\n ,x = NULL\n ,y = NULL\n ) +\n facet_grid(~name, scales = \"free_x\",labeller = variable_labeller) +\n theme_bw() +\n geom_text(aes(label = 
round(value, 0)), hjust = 0) +\n scale_y_continuous(expand = expand_scale(mult = c(0,0.2))) +\n scale_fill_manual(values = c(\"TRUE\" = \"#fc8d59\", \"FALSE\" = \"#2b83ba\"))\ng1\n\n\n\n\n\n\n\n\nReusehttps://creativecommons.org/licenses/by/4.0/CitationBibTeX citation:@online{belanger2020,\n author = {Belanger, Kyle},\n title = {Facets and a {Lesson} in {Humility}},\n date = {2020-01-29},\n langid = {en}\n}\nFor attribution, please cite this work as:\nBelanger, Kyle. 2020. “Facets and a Lesson in Humility.”\nJanuary 29, 2020." }, { "objectID": "index.html", @@ -39,49 +95,14 @@ "href": "blog.html", "title": "Posts", "section": "", - "text": "TidyTuesday 2021 Week 6: HBCU Enrollment\n\n\nTidyTuesday 2021 Week 6: HBCU Enrollment. Posts looks at tidying the data ,as well as making some graphs about the data.\n\n\n\n\nTidyTuesday\n\n\n\n\n\n\n\n\n\n\n\nFeb 26, 2021\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nConverting From Blogdown to Distill\n\n\nA meta post on transferring from a blogdown to distill blog site\n\n\n\n\nDistill\n\n\n\n\n\n\n\n\n\n\n\nJan 12, 2021\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nDiabetes in Rural North Carolina : Data Collection and Cleaning\n\n\nThis is the second post in the series exploring Diabetes in rural North Carolina. This post will explore the data used for this project, from collection, cleaning, and analysis ready data.\n\n\n\n\n\n\n\n\n\nJul 25, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nImporting Excel Data with Multiple Header Rows\n\n\nA solution for importing Excel Data that contains two header rows.\n\n\n\n\n\n\n\n\n\nJun 22, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nBasic Exploration of WHO Tuberculosis Data\n\n\nToday I am going to dive into some real life data from the World Health Organization (WHO), exploring new and relapse cases of Tuberculosis. 
I clean up the data, and then make a few graphs to explore different variables.\n\n\n\n\n\n\n\n\n\nFeb 13, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nLine Graphs and Interactivity\n\n\nTableau for Healthcare Chapter 10. Static and Interactive examples\n\n\n\n\n\n\n\n\n\nFeb 10, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nFacets and a Lesson in Humility\n\n\nA look at Tableau for Healthcare Chapter 8. Table Lens graph.\n\n\n\n\n\n\n\n\n\nJan 29, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nMy Start to R\n\n\nA short introduction to my blog, and R journey.\n\n\n\n\n\n\n\n\n\nJan 24, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\nNo matching items" - }, - { - "objectID": "posts/post-with-code/index.html", - "href": "posts/post-with-code/index.html", - "title": "Post With Code", - "section": "", - "text": "This is a post with executable code." + "text": "TidyTuesday 2021 Week 6: HBCU Enrollment\n\n\nTidyTuesday 2021 Week 6: HBCU Enrollment. Posts looks at tidying the data ,as well as making some graphs about the data.\n\n\n\n\nTidyTuesday\n\n\n\n\n\n\n\n\n\n\n\nFeb 26, 2021\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nConverting From Blogdown to Distill\n\n\nA meta post on transferring from a blogdown to distill blog site\n\n\n\n\nDistill\n\n\n\n\n\n\n\n\n\n\n\nJan 12, 2021\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nDiabetes in Rural North Carolina : Data Collection and Cleaning\n\n\nThis is the second post in the series exploring Diabetes in rural North Carolina. 
This post will explore the data used for this project, from collection, cleaning, and analysis ready data.\n\n\n\n\n\n\n\n\n\nJul 25, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nDiabetes in Rural North Carolina : Exploring Prevalence Trends\n\n\nThis post introduces the exploration of the Diabetes epidemic in North Carolina\n\n\n\n\n\n\n\n\n\nJun 25, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nImporting Excel Data with Multiple Header Rows\n\n\nA solution for importing Excel Data that contains two header rows.\n\n\n\n\n\n\n\n\n\nJun 22, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nBasic Exploration of WHO Tuberculosis Data\n\n\nToday I am going to dive into some real life data from the World Health Organization (WHO), exploring new and relapse cases of Tuberculosis. I clean up the data, and then make a few graphs to explore different variables.\n\n\n\n\n\n\n\n\n\nFeb 13, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nLine Graphs and Interactivity\n\n\nTableau for Healthcare Chapter 10. Static and Interactive examples\n\n\n\n\n\n\n\n\n\nFeb 10, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nFacets and a Lesson in Humility\n\n\nA look at Tableau for Healthcare Chapter 8. Table Lens graph.\n\n\n\n\n\n\n\n\n\nJan 29, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\n \n\n\n\n\nMy Start to R\n\n\nA short introduction to my blog, and R journey.\n\n\n\n\n\n\n\n\n\nJan 24, 2020\n\n\nKyle Belanger\n\n\n\n\n\n\nNo matching items" }, { "objectID": "posts/2020-01-04_my-start-to-r/my-start-to-r.html", "href": "posts/2020-01-04_my-start-to-r/my-start-to-r.html", "title": "My Start to R", "section": "", - "text": "Today starts my attempt at sharing my R journey with the world! I have been learning R off and on now since late 2019, I have begun to take it much more serious as I work through my Data Analytics class at UCF. My love for all things numbers and graphs has really blossomed, and I am choosing to share that love with anyone who cares to read. 
I will not claim to be the best at R, or any programming for that matter, but these are my attempts. Each post in this serious will be replicated a graph created in Tableau from the book Tableau for Healthcare. Todays graph is a simple horizontal bar chart, in transferring to both a new blog site and computer I have unfortunately lost the original bar graph, but trust me the one I created looks just like it.\n\nLoad Libraries\n\nlibrary(tidyr)\nlibrary(magrittr)\nlibrary(ggplot2)\nlibrary(stringr)\nlibrary(dplyr)\n\n\n\nImport Data\n\nds <- readxl::read_excel(\n path = \"Tableau 10 Training Practice Data.xlsx\" \n ,sheet = \"02 - Patient Falls-Single Hosp\"\n )\n\n\n\nClean Data Names\n\n#should make reusable forumla at later time\nnames(ds) <- tolower(names(ds))\nnames(ds) <- str_replace_all(names(ds),\" \", \"_\")\n\n\n\nConvert Data to ‘Long Form’\n\nds1 <- ds %>% \n gather(\"patient_falls_no_injury_rate\" , \"patient_falls_with_injury_rate\"\n ,key = \"injury\" \n ,value = \"rate\" ) %>% \n mutate(injury = (injury == \"patient_falls_with_injury_rate\"))\n\n\n\nGraph 5.1\n\nb1 <- ds %>% \n ggplot(mapping = aes(x = reorder(type_of_care,total_patient_falls_rate ) , y = total_patient_falls_rate)) +\n geom_col(fill = \"#2b83ba\") + \n coord_flip() +\n scale_y_continuous(breaks = NULL) +\n theme(axis.ticks = element_blank()) +\n labs(title = \"Rate of Patient Falls (per 1,000 Pateint Days)\\nby Type of Care for FY2017\"\n ,x = NULL\n ,y = NULL\n ) +\n theme_classic() +\n geom_text(aes(label = format(total_patient_falls_rate, digits = 2)), nudge_y = -.25, color = \"white\")\n \nb1\n\n\n\n\n\n\n\n\nCitationBibTeX citation:@online{belanger2020,\n author = {Belanger, Kyle},\n title = {My {Start} to {R}},\n date = {2020-01-24},\n langid = {en}\n}\nFor attribution, please cite this work as:\nBelanger, Kyle. 2020. “My Start to R.” January 24, 2020." 
- }, - { - "objectID": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html", - "href": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html", - "title": "Basic Exploration of WHO Tuberculosis Data", - "section": "", - "text": "Today I am going to dive into some real life data from the World Health Organization (WHO), exploring new and relapse cases of Tuberculosis. I clean up the data, and then make a few graphs to explore different variables." - }, - { - "objectID": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html#a-different-way-to-look", - "href": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html#a-different-way-to-look", - "title": "Basic Exploration of WHO Tuberculosis Data", - "section": "A different way to look", - "text": "A different way to look\nCould there be any correlation between a countries population and the amount of TB cases? Maybe its just as simple as having more people means more people to get sick? 
Lets bring in another data set, again from World Bank Found Here, this contains total population data by country.\n\npop_raw <- read.csv(\"API_SP.POP.TOTL_DS2_en_csv_v2_713131.csv\"\n ,skip = 4)\n#If this looks famialer its because it is, the data set looks very simalar to the GDP data\n#In the future this could be moved to a function to allow cleaning much easier\npop1 <- pop_raw %>% \n select(-(Indicator.Name:X2012)\n ,-X2019\n ,-X) %>% \n pivot_longer(cols = X2013:X2018\n ,names_to = \"year\" \n ,values_to = \"population\") %>% \n mutate_if(is.character\n ,str_remove_all\n ,pattern = \"X(?=\\\\d*)\")\n\n#now lets combine this into are overall data set\n\nwho_combined <- who_combined %>% \n mutate(year = as.character(year)) %>% \n left_join(y = pop1) %>% \n select(-Country.Name)\n\n#now lets Graph again\n\ng3 <- who_combined %>% \n filter(str_detect(age,\"014|15plus|u\"),year == 2018) %>% \n group_by(country) %>% \n summarise(sum_tb_cases = (sum(values,na.rm = TRUE)/10000)\n ,population = first(population)/1000000\n ,who_region = first(g_whoregion)) %>% \n mutate(\n label = ifelse((population>250), yes = as.character(country),no = \"\")) %>%\n ggplot(aes(x = population, y = sum_tb_cases )) +\n geom_point(aes(color = who_region)) +\n ggrepel::geom_text_repel(aes(x = population, y = sum_tb_cases, label = label)) +\n labs(\n title = \"Total TB Cases by Country compared to Gross Domestic Product (GDP)\"\n ,x = \"Population (in Millions)\"\n ,y = \"Total TB Case (per 10,000 cases)\"\n ,color = \"WHO Region\"\n ) +\n theme_bw() \n\n g3 \n\n\n\n\n\nFurther Exploration\nMaybe we are on to something, the more people, the more likely they are to get sick! 
However India seems to have a very large number of cases so lets break these cases down further by age group for 2018.\n\ng4 <- who_combined %>% \n filter(year == 2018\n ,country == \"India\"\n ,!(str_detect(age,\"15plus|ageunk|u|014\"))\n ,(str_detect(sex,\"m|f\"))\n ) %>% \n mutate(age_range = glue::glue(\"{age_start} -- {age_end}\")) %>% \n ggplot(aes(x = reorder(age_range, as.numeric(age_start)), y = (values/1000), fill = sex)) +\n geom_col(position = \"dodge\") +\n labs(\n title = \"TB Case in India by age and gender 2018\"\n ,x = NULL\n ,y = \"Total Cases (per 1000)\"\n ,fill = \"Gender\") +\n scale_fill_manual(labels = c(\"Female\",\"Male\"), values = c(\"#e9a3c9\",\"#67a9cf\") )\n \ng4\n\n\n\n\nThere seems to be a huge spike in cases after adolescences. Females have a sharp decline the older they get, where as male case stay elevated with a slight decrease at 55." - }, - { - "objectID": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html#last-exploration", - "href": "posts/2020-02-13_basic-who-TB-data/basic-exploration-of-who-tuberculosis-data.html#last-exploration", - "title": "Basic Exploration of WHO Tuberculosis Data", - "section": "Last Exploration", - "text": "Last Exploration\nLets look at overall cases in India, going back to 1980 and see if there as been any trends. 
To get these numbers we will go back to our raw data and strip everything out expect the total count\n\ng5 <- who_raw %>% \n filter(country == \"India\") %>% \n select(year, c_newinc) %>% \n ggplot(aes(x = year, y = c_newinc/1000000)) +\n geom_line() +\n geom_point() +\n labs(\n title = \"New and Relapse Tuberculosis Cases In India \\n1980 -- 2018\"\n ,x = NULL\n ,y = \"Total Cases (in millions)\") +\n theme_bw() +\n theme(plot.title = element_text(hjust = 0.5)) + #center title \n scale_x_continuous(breaks = seq(1980,2020,5)) +\n scale_y_continuous(breaks = scales::pretty_breaks(n=10)) #different way to add tick marks\ng5\n\n\n\n\nCases were steadily rising from 1980 to 1990, then suddenly feel off. Starting in the early 2010s there was a sharp increase and the amount of new and relapse cases just keep growing." - }, - { - "objectID": "posts/2020-01-29_facets-and-humility/facets-and-a-lesson-in-humility.html", - "href": "posts/2020-01-29_facets-and-humility/facets-and-a-lesson-in-humility.html", - "title": "Facets and a Lesson in Humility", - "section": "", - "text": "Todays post is a lesson in Facets, as well as humility. The task this week was to replicate the graph in Chapter 8 of Tableau for Healthcare in R. The graph in question is called a Table Lens (This is the name the book uses, however I did have trouble finding this name in Google searches), it is a collection of charts with a common theme, this time looking at countries in various WHO regions and some statistics associated with mortality as well as health expenditure. I say this is a lesson in humiltiy as I have read through the excellent book R for Data Science, and yet the idea of faceting a ggplot graph slipped my mind. This ended with hours of trying to find a package in R to line up graphs, and way more time then I care to admit spent on getting things prefect. I did find such a package called cowplots, which can be found here. 
While this is an excellent package, its use was unecessary and I reverted back to using the excellent facet feature of GGplot, which can be seen below! \n\nLoad Libraries\n\nlibrary(magrittr) #pipes\nlibrary(ggplot2) #ploting \nlibrary(dplyr)\nlibrary(tidyr)\n\n\n\nImport Data\n\nds <- readxl::read_xlsx(path = \"../2020-01-04_my-start-to-r/Tableau 10 Training Practice Data.xlsx\"\n ,sheet = \"03 - WHO Life Expect & Mort\"\n )\n\n\n\nClean Names and Transform\n\nvarnames <- c(\"who_region\", \"country\", \"year\" , \"sex\" , \"life_expect_birth\" , \"neo_mort\"\n ,\"under_five_mort\" , \"health_expenditure\")\nnames(ds) <- varnames\n\n# Order Countries based on Life Expectancy at Birth\n\nds$country <- factor(ds$country, levels = ds$country[order(ds$life_expect_birth)]) \n\n#To \"Long\" Form\n\nds1 <- ds %>% pivot_longer(5:8)#select columns 5 throuh 8, leave new columns at default names\n\n# Set up labels for Facet, as well as function for Facet Labeller\n\nfacet_labels <- list(\n\"life_expect_birth\" = \"Life Expectancy at Birth \" \n,\"neo_mort\" = \"Neonatal Mortality Rate\" \n,\"under_five_mort\" = \"Under-Five Mortality Rate\"\n,\"health_expenditure\" = \"Health Expenditure per Capita (US$)\" )\n\nvariable_labeller <- function(variable,value){\n return(facet_labels[value])\n}\n\n\n\nGraphs\n\nhightlight_countries <- (c(\"Mauritania\", \"South Africa\")) \n\ng1 <- ds1 %>% filter(who_region == \"Africa\") %>% \n mutate(name = factor(name, levels = c(\"life_expect_birth\" , \"neo_mort\"\n ,\"under_five_mort\" , \"health_expenditure\"))\n ,highlight = country %in% hightlight_countries) %>% \n ggplot(aes(x = country, y = value, fill = highlight)) +\n geom_col(show.legend = FALSE) +\n coord_flip() +\n labs(\n title = \"World Bank Life Expectancy, Neonatal & Under-Five Mortality Rates, and Health Expenditure Analysis\"\n ,x = NULL\n ,y = NULL\n ) +\n facet_grid(~name, scales = \"free_x\",labeller = variable_labeller) +\n theme_bw() +\n geom_text(aes(label = 
round(value, 0)), hjust = 0) +\n scale_y_continuous(expand = expand_scale(mult = c(0,0.2))) +\n scale_fill_manual(values = c(\"TRUE\" = \"#fc8d59\", \"FALSE\" = \"#2b83ba\"))\ng1\n\n\n\n\n\n\n\n\nReusehttps://creativecommons.org/licenses/by/4.0/CitationBibTeX citation:@online{belanger2020,\n author = {Belanger, Kyle},\n title = {Facets and a {Lesson} in {Humility}},\n date = {2020-01-29},\n langid = {en}\n}\nFor attribution, please cite this work as:\nBelanger, Kyle. 2020. “Facets and a Lesson in Humility.”\nJanuary 29, 2020." + "text": "Today starts my attempt at sharing my R journey with the world! I have been learning R off and on now since late 2019, I have begun to take it much more serious as I work through my Data Analytics class at UCF. My love for all things numbers and graphs has really blossomed, and I am choosing to share that love with anyone who cares to read. I will not claim to be the best at R, or any programming for that matter, but these are my attempts. Each post in this serious will be replicated a graph created in Tableau from the book Tableau for Healthcare. 
Todays graph is a simple horizontal bar chart, in transferring to both a new blog site and computer I have unfortunately lost the original bar graph, but trust me the one I created looks just like it.\n\nLoad Libraries\n\nlibrary(tidyr)\nlibrary(magrittr)\nlibrary(ggplot2)\nlibrary(stringr)\nlibrary(dplyr)\n\n\n\nImport Data\n\nds <- readxl::read_excel(\n path = \"Tableau 10 Training Practice Data.xlsx\" \n ,sheet = \"02 - Patient Falls-Single Hosp\"\n )\n\n\n\nClean Data Names\n\n#should make reusable forumla at later time\nnames(ds) <- tolower(names(ds))\nnames(ds) <- str_replace_all(names(ds),\" \", \"_\")\n\n\n\nConvert Data to ‘Long Form’\n\nds1 <- ds %>% \n gather(\"patient_falls_no_injury_rate\" , \"patient_falls_with_injury_rate\"\n ,key = \"injury\" \n ,value = \"rate\" ) %>% \n mutate(injury = (injury == \"patient_falls_with_injury_rate\"))\n\n\n\nGraph 5.1\n\nb1 <- ds %>% \n ggplot(mapping = aes(x = reorder(type_of_care,total_patient_falls_rate ) , y = total_patient_falls_rate)) +\n geom_col(fill = \"#2b83ba\") + \n coord_flip() +\n scale_y_continuous(breaks = NULL) +\n theme(axis.ticks = element_blank()) +\n labs(title = \"Rate of Patient Falls (per 1,000 Pateint Days)\\nby Type of Care for FY2017\"\n ,x = NULL\n ,y = NULL\n ) +\n theme_classic() +\n geom_text(aes(label = format(total_patient_falls_rate, digits = 2)), nudge_y = -.25, color = \"white\")\n \nb1\n\n\n\n\n\n\n\n\nReusehttps://creativecommons.org/licenses/by/4.0/CitationBibTeX citation:@online{belanger2020,\n author = {Belanger, Kyle},\n title = {My {Start} to {R}},\n date = {2020-01-24},\n langid = {en}\n}\nFor attribution, please cite this work as:\nBelanger, Kyle. 2020. “My Start to R.” January 24, 2020." 
}, { "objectID": "posts/2020-02-10_line-graphs-and-interactivity/line-graphs-and-interactivity.html", @@ -125,6 +146,13 @@ "section": "plotly", "text": "plotly\nOne of the nice features of Tableau is the fact the graphs are interactive, while a good graph should speak for itself, end users love pretty things. I have been experimenting with Plotly, which has an open source package for R (as well as many other programming languages!). This example only just scratches the surface, but there will be many more to come!\n\ng2 <- ds1 %>% \n plot_ly(x = ~month, y = ~pct_tests_pos_for_influenza, type = \"scatter\", mode = \"lines\" \n ,color = ~fiscal_year\n ,colors = c(\"#a6611a\",\"#dfc27d\",\"#80cdc1\",\"#018571\")\n , hoverinfo = 'y') %>% \n layout(xaxis = list(\n title = \"\"\n )\n ,yaxis = list(\n title = \"% Tests (+) for Influenza\"\n )\n ,title = \"Flu Viral Surveillance: % Respiratory Specimens Positive for Influenza\"\n ,legend = list(\n x = 100\n ,y = 0.5\n ) \n \n )\n\ng2" }, + { + "objectID": "posts/2020-06-22_excel-data-multiple-headers/importing-excel-data-with-multiple-header-rows.html", + "href": "posts/2020-06-22_excel-data-multiple-headers/importing-excel-data-with-multiple-header-rows.html", + "title": "Importing Excel Data with Multiple Header Rows", + "section": "", + "text": "Problem\nRecently I tried to important some Microsoft Excel data into R, and ran into an issue were the data actually had two different header rows. The top row listed a group, and then the second row listed a category within that group. Searching goggle I couldn’t really find a good example of what I was looking for, so I am putting it here in hopes of helping someone else!\n\n\nExample Data\nI have created a small Excel file to demonstrate what I am talking about. Download it here. This is the data from Excel. 
\n\n\nCheck Data\nFirst we will read the file in using the package readxl and view the data without doing anything special to it.\n\nlibrary(readxl) # load the readxl library\nlibrary(tidyverse) # load the tidyverse for manipulating the data\nfile_path <- \"example_data.xlsx\" # set the file path\nds0 <- read_excel(file_path) # read the file\nds0\n\n# A tibble: 7 × 7\n Name `Test 1` ...3 ...4 `Test 2` ...6 ...7 \n <chr> <chr> <chr> <chr> <chr> <chr> <chr>\n1 <NA> Run 1 Run 2 Run 3 Run 1 Run 2 Run 3\n2 Max 22 23 24 25 26 27 \n3 Phoebe 34 34 32 34 51 12 \n4 Scamp 35 36 21 22 23 24 \n5 Chance 1234 1235 1236 1267 173 1233 \n6 Aimee 420 123 690 42 45 12 \n7 Kyle 22 23 25 26 67 54 \n\n\n\n\nNew Header Names\n\nStep 1\nFirst lets read back the data, this time however with some options. We will set the n_max equal to 2, to only read the first two rows, and set col_names to FALSE so we do not read the first row as headers.\n\nds1 <- read_excel(file_path, n_max = 2, col_names = FALSE)\nds1\n\n# A tibble: 2 × 7\n ...1 ...2 ...3 ...4 ...5 ...6 ...7 \n <chr> <chr> <chr> <chr> <chr> <chr> <chr>\n1 Name Test 1 <NA> <NA> Test 2 <NA> <NA> \n2 <NA> Run 1 Run 2 Run 3 Run 1 Run 2 Run 3\n\n\n\n\nStep 2\nNow that we have our headers lets first transpose them to a vertical matrix using the base function t(), then we will turn it back into a tibble to allow us to use tidyr fill function.\n\nnames <- ds1 %>%\n t() %>% #transpose to a matrix\n as_tibble() #back to tibble\nnames\n\n# A tibble: 7 × 2\n V1 V2 \n <chr> <chr>\n1 Name <NA> \n2 Test 1 Run 1\n3 <NA> Run 2\n4 <NA> Run 3\n5 Test 2 Run 1\n6 <NA> Run 2\n7 <NA> Run 3\n\n\nNote that tidyr fill can not work row wise, thus the need to flip the tibble so it is long vs wide.\n\n\nStep 3\nNow we use tidyr fill function to fill the NA’s with whatever value it finds above.\n\nnames <- names %>% fill(V1) #use dplyr fill to fill in the NA's\nnames\n\n# A tibble: 7 × 2\n V1 V2 \n <chr> <chr>\n1 Name <NA> \n2 Test 1 Run 1\n3 Test 1 Run 2\n4 Test 1 
Run 3\n5 Test 2 Run 1\n6 Test 2 Run 2\n7 Test 2 Run 3\n\n\n\n\nStep 4\nThis is where my data differed from many of the examples I could find online. Because the second row is also a header we can not just get rid of them. We can solve this using paste() combined with dplyr mutate to form a new column that combines the first and second column.\n\nnames <- names %>%\n mutate(\n new_names = paste(V1,V2, sep = \"_\")\n )\nnames\n\n# A tibble: 7 × 3\n V1 V2 new_names \n <chr> <chr> <chr> \n1 Name <NA> Name_NA \n2 Test 1 Run 1 Test 1_Run 1\n3 Test 1 Run 2 Test 1_Run 2\n4 Test 1 Run 3 Test 1_Run 3\n5 Test 2 Run 1 Test 2_Run 1\n6 Test 2 Run 2 Test 2_Run 2\n7 Test 2 Run 3 Test 2_Run 3\n\n\n\n\nStep 4a\nOne more small clean up task, in the example data the first column header Name, did not have a second label, this has created a name with an NA attached. We can use stringr to remove this NA.\n\nnames <- names %>% mutate(across(new_names, ~str_remove_all(.,\"_NA\")))\nnames\n\n# A tibble: 7 × 3\n V1 V2 new_names \n <chr> <chr> <chr> \n1 Name <NA> Name \n2 Test 1 Run 1 Test 1_Run 1\n3 Test 1 Run 2 Test 1_Run 2\n4 Test 1 Run 3 Test 1_Run 3\n5 Test 2 Run 1 Test 2_Run 1\n6 Test 2 Run 2 Test 2_Run 2\n7 Test 2 Run 3 Test 2_Run 3\n\n\n\n\nStep 5\nNow that are new name column is the way we want it, we can use dpylrs pull to return a vector of just that column\n\nnames <- names %>% pull(new_names)\n\n\n\n\nFinal Data\nNow that we have a vector of column names lets read in the original file using our new names. We set the skip argument to 2, to skip the first two rows, and set col_names equal to our vector of names. 
Note the last step I used the janitor package to provide names in snake case (the default for the clean names function.)\n\nexample_data <- readxl::read_excel(file_path, col_names = names, skip = 2) %>%\n janitor::clean_names()\nexample_data\n\n# A tibble: 6 × 7\n name test_1_run_1 test_1_run_2 test_1_run_3 test_2_run_1 test_2_run_2\n <chr> <dbl> <dbl> <dbl> <dbl> <dbl>\n1 Max 22 23 24 25 26\n2 Phoebe 34 34 32 34 51\n3 Scamp 35 36 21 22 23\n4 Chance 1234 1235 1236 1267 173\n5 Aimee 420 123 690 42 45\n6 Kyle 22 23 25 26 67\n# ℹ 1 more variable: test_2_run_3 <dbl>\n\n\n\n\nOther Help\nWhile searching for some solutions to my problem I found two good examples, however neither did exactly what I was trying to do.\n\nThis post by Lisa Deburine is pretty close to what I was trying to accomplish and gave me a good starting point. Read it here\nThis post by Alison Hill solves a simlar but slightly different problem. In her data the 2nd row is actually metadata not a second set of headers. Read it here\n\n\n\n\n\nReusehttps://creativecommons.org/licenses/by/4.0/CitationBibTeX citation:@online{belanger2020,\n author = {Belanger, Kyle},\n title = {Importing {Excel} {Data} with {Multiple} {Header} {Rows}},\n date = {2020-06-22},\n langid = {en}\n}\nFor attribution, please cite this work as:\nBelanger, Kyle. 2020. “Importing Excel Data with Multiple Header\nRows.” June 22, 2020." + }, { "objectID": "posts/2020-07-25_diabetes-data-collection-and-cleaning/diabetes-in-rural-north-carolina-data-collection-and-cleaning.html", "href": "posts/2020-07-25_diabetes-data-collection-and-cleaning/diabetes-in-rural-north-carolina-data-collection-and-cleaning.html", @@ -167,46 +195,11 @@ "section": "Diabetes Percentages", "text": "Diabetes Percentages\nThe final data set comes from the CDC Diabetes Atlas and contains the estimated prevalence of diabetes in each county of the United States, by year. 
The data set also includes the upper and lower estimated limits, see the previous post for an explanation of how these numbers are calculated. The data was downloaded by year, and then merged into one data set for the project.\nView greeter script here\n\n\n\nUS Diabetes Data\n\n\nYear\nCounty Fips\nDiabetes Percentage\nDiabetes Lower Limit\nDiabetes Upper Limit\n\n\n\n\n2010\n01001\n11.2\n8.8\n13.9\n\n\n2010\n01003\n10.2\n8.7\n11.9\n\n\n2010\n01005\n13.0\n10.6\n15.9\n\n\n2010\n01007\n10.6\n8.2\n13.3\n\n\n2010\n01009\n12.6\n9.8\n15.7\n\n\n2010\n01011\n16.1\n12.4\n20.4" }, - { - "objectID": "posts/2020-06-22_excel-data-multiple-headers/importing-excel-data-with-multiple-header-rows.html", - "href": "posts/2020-06-22_excel-data-multiple-headers/importing-excel-data-with-multiple-header-rows.html", - "title": "Importing Excel Data with Multiple Header Rows", - "section": "", - "text": "Problem\nRecently I tried to important some Microsoft Excel data into R, and ran into an issue were the data actually had two different header rows. The top row listed a group, and then the second row listed a category within that group. Searching goggle I couldn’t really find a good example of what I was looking for, so I am putting it here in hopes of helping someone else!\n\n\nExample Data\nI have created a small Excel file to demonstrate what I am talking about. Download it here. This is the data from Excel. 
\n\n\nCheck Data\nFirst we will read the file in using the package readxl and view the data without doing anything special to it.\n\nlibrary(readxl) # load the readxl library\nlibrary(tidyverse) # load the tidyverse for manipulating the data\nfile_path <- \"example_data.xlsx\" # set the file path\nds0 <- read_excel(file_path) # read the file\nds0\n\n# A tibble: 7 × 7\n Name `Test 1` ...3 ...4 `Test 2` ...6 ...7 \n <chr> <chr> <chr> <chr> <chr> <chr> <chr>\n1 <NA> Run 1 Run 2 Run 3 Run 1 Run 2 Run 3\n2 Max 22 23 24 25 26 27 \n3 Phoebe 34 34 32 34 51 12 \n4 Scamp 35 36 21 22 23 24 \n5 Chance 1234 1235 1236 1267 173 1233 \n6 Aimee 420 123 690 42 45 12 \n7 Kyle 22 23 25 26 67 54 \n\n\n\n\nNew Header Names\n\nStep 1\nFirst lets read back the data, this time however with some options. We will set the n_max equal to 2, to only read the first two rows, and set col_names to FALSE so we do not read the first row as headers.\n\nds1 <- read_excel(file_path, n_max = 2, col_names = FALSE)\nds1\n\n# A tibble: 2 × 7\n ...1 ...2 ...3 ...4 ...5 ...6 ...7 \n <chr> <chr> <chr> <chr> <chr> <chr> <chr>\n1 Name Test 1 <NA> <NA> Test 2 <NA> <NA> \n2 <NA> Run 1 Run 2 Run 3 Run 1 Run 2 Run 3\n\n\n\n\nStep 2\nNow that we have our headers lets first transpose them to a vertical matrix using the base function t(), then we will turn it back into a tibble to allow us to use tidyr fill function.\n\nnames <- ds1 %>%\n t() %>% #transpose to a matrix\n as_tibble() #back to tibble\nnames\n\n# A tibble: 7 × 2\n V1 V2 \n <chr> <chr>\n1 Name <NA> \n2 Test 1 Run 1\n3 <NA> Run 2\n4 <NA> Run 3\n5 Test 2 Run 1\n6 <NA> Run 2\n7 <NA> Run 3\n\n\nNote that tidyr fill can not work row wise, thus the need to flip the tibble so it is long vs wide.\n\n\nStep 3\nNow we use tidyr fill function to fill the NA’s with whatever value it finds above.\n\nnames <- names %>% fill(V1) #use dplyr fill to fill in the NA's\nnames\n\n# A tibble: 7 × 2\n V1 V2 \n <chr> <chr>\n1 Name <NA> \n2 Test 1 Run 1\n3 Test 1 Run 2\n4 Test 1 
Run 3\n5 Test 2 Run 1\n6 Test 2 Run 2\n7 Test 2 Run 3\n\n\n\n\nStep 4\nThis is where my data differed from many of the examples I could find online. Because the second row is also a header we can not just get rid of them. We can solve this using paste() combined with dplyr mutate to form a new column that combines the first and second column.\n\nnames <- names %>%\n mutate(\n new_names = paste(V1,V2, sep = \"_\")\n )\nnames\n\n# A tibble: 7 × 3\n V1 V2 new_names \n <chr> <chr> <chr> \n1 Name <NA> Name_NA \n2 Test 1 Run 1 Test 1_Run 1\n3 Test 1 Run 2 Test 1_Run 2\n4 Test 1 Run 3 Test 1_Run 3\n5 Test 2 Run 1 Test 2_Run 1\n6 Test 2 Run 2 Test 2_Run 2\n7 Test 2 Run 3 Test 2_Run 3\n\n\n\n\nStep 4a\nOne more small clean up task, in the example data the first column header Name, did not have a second label, this has created a name with an NA attached. We can use stringr to remove this NA.\n\nnames <- names %>% mutate(across(new_names, ~str_remove_all(.,\"_NA\")))\nnames\n\n# A tibble: 7 × 3\n V1 V2 new_names \n <chr> <chr> <chr> \n1 Name <NA> Name \n2 Test 1 Run 1 Test 1_Run 1\n3 Test 1 Run 2 Test 1_Run 2\n4 Test 1 Run 3 Test 1_Run 3\n5 Test 2 Run 1 Test 2_Run 1\n6 Test 2 Run 2 Test 2_Run 2\n7 Test 2 Run 3 Test 2_Run 3\n\n\n\n\nStep 5\nNow that are new name column is the way we want it, we can use dpylrs pull to return a vector of just that column\n\nnames <- names %>% pull(new_names)\n\n\n\n\nFinal Data\nNow that we have a vector of column names lets read in the original file using our new names. We set the skip argument to 2, to skip the first two rows, and set col_names equal to our vector of names. 
Note the last step I used the janitor package to provide names in snake case (the default for the clean names function.)\n\nexample_data <- readxl::read_excel(file_path, col_names = names, skip = 2) %>%\n janitor::clean_names()\nexample_data\n\n# A tibble: 6 × 7\n name test_1_run_1 test_1_run_2 test_1_run_3 test_2_run_1 test_2_run_2\n <chr> <dbl> <dbl> <dbl> <dbl> <dbl>\n1 Max 22 23 24 25 26\n2 Phoebe 34 34 32 34 51\n3 Scamp 35 36 21 22 23\n4 Chance 1234 1235 1236 1267 173\n5 Aimee 420 123 690 42 45\n6 Kyle 22 23 25 26 67\n# ℹ 1 more variable: test_2_run_3 <dbl>\n\n\n\n\nOther Help\nWhile searching for some solutions to my problem I found two good examples, however neither did exactly what I was trying to do.\n\nThis post by Lisa Deburine is pretty close to what I was trying to accomplish and gave me a good starting point. Read it here\nThis post by Alison Hill solves a simlar but slightly different problem. In her data the 2nd row is actually metadata not a second set of headers. Read it here\n\n\n\n\n\nReusehttps://creativecommons.org/licenses/by/4.0/CitationBibTeX citation:@online{belanger2020,\n author = {Belanger, Kyle},\n title = {Importing {Excel} {Data} with {Multiple} {Header} {Rows}},\n date = {2020-06-22},\n langid = {en}\n}\nFor attribution, please cite this work as:\nBelanger, Kyle. 2020. “Importing Excel Data with Multiple Header\nRows.” June 22, 2020." - }, { "objectID": "posts/2021-02-26_tidytuesday-hbcu-enrollment/tidytuesday-2021-week-6-hbcu-enrolment.html", "href": "posts/2021-02-26_tidytuesday-hbcu-enrollment/tidytuesday-2021-week-6-hbcu-enrolment.html", "title": "TidyTuesday 2021 Week 6: HBCU Enrollment", "section": "", "text": "Introduction\nRecently I was struggling to find a data project to work on, I felt a bit stuck with some of my current projects, so I begun to scour the internet to find something to work on. 
I stumbled upon (TidyTuesday)[https://github.com/rfordatascience/tidytuesday] a weekly project where untidy data is posted from various sources, for the goal of practicing cleaning and visualizing. There is not right or wrong answers for TidyTuesday, this was exactly what I was looking for! This week (well by the time this was posted, a few weeks ago) the data set was about Historically Black Colleges and Universities. Within the posted data there were a few different data sets, I chose to work with the set dealing with High school Graduation rates, throughout this post I will explain my steps for cleaning and then present a few different graphs. It should also be noted that in the first section my code blocks will build upon themselves, so the same code will be duplicated as I add more steps to it.\n\n\nLoad Data\nIn this first block we will load some required libraries as well as load in the raw data. This dataset contains data for Highschool graduation rates by race. One thing to point out here is the use of import::from(), will its use here is a bit overkill, it was more for my practice. 
In this case I am importing the function %nin from the Hmisc package, which in the opposite of the function %in% from base R.\n\nlibrary(dplyr)\nlibrary(ggplot2)\n\nimport::from(Hmisc, `%nin%`)\n\nhs_students_raw <- readxl::read_xlsx(\"104.10.xlsx\", sheet = 1)\n\nglimpse(hs_students_raw)\n\nRows: 48\nColumns: 19\n$ Total <dbl> 1910…\n$ `Total, percent of all persons age 25 and over` <dbl> 13.5…\n$ `Standard Errors - Total, percent of all persons age 25 and over` <chr> \"(—)…\n$ White1 <chr> \"—\",…\n$ `Standard Errors - White1` <chr> \"(†)…\n$ Black1 <chr> \"—\",…\n$ `Standard Errors - Black1` <chr> \"(†)…\n$ Hispanic <chr> \"—\",…\n$ `Standard Errors - Hispanic` <chr> \"(†)…\n$ `Total - Asian/Pacific Islander` <chr> \"—\",…\n$ `Standard Errors - Total - Asian/Pacific Islander` <chr> \"(†)…\n$ `Asian/Pacific Islander - Asian` <chr> \"—\",…\n$ `Standard Errors - Asian/Pacific Islander - Asian` <chr> \"(†)…\n$ `Asian/Pacific Islander - Pacific Islander` <chr> \"—\",…\n$ `Standard Errors - Asian/Pacific Islander - Pacific Islander` <chr> \"(†)…\n$ `American Indian/\\r\\nAlaska Native` <chr> \"—\",…\n$ `Standard Errors - American Indian/\\r\\nAlaska Native` <chr> \"(†)…\n$ `Two or more race` <chr> \"—\",…\n$ `Standard Errors - Two or more race` <chr> \"(†)…\n\n\nNow we are going to start cleaning the data. First I am going to filter for years 1985 and up, prior to this year the data set is a bit spardic, so to keep it clean I am only going to look at 1985 and up. 
There are also 3 odd years (19103,19203,19303) that I am not sure what those are so I will remove that data as well.\n\nhs_students <- hs_students_raw %>% \n filter(Total >= 1985) %>% \n filter(Total %nin% c(19103, 19203, 19303))\n\nNext I am going to convert all columns to be numeric, because of some blanks in the original import all of the columns read in as characters instead of numeric.\n\nhs_students <- hs_students_raw %>% \n filter(Total >= 1985) %>% \n filter(Total %nin% c(19103, 19203, 19303)) %>% \n mutate(across(everything(), as.numeric))\n\nNext I am going to rename the columns. First I rename the column Total, into year, as this column holds the year! Then I use stringr::str_remove_all to remove the long phrase ‘percent of all persons age 25 and over’, as well as the number 1. For some reason the Black and White columns each have a number 1 at the end, I think this is for some sort of footnote but we will just remove it.\n\nhs_students <- hs_students_raw %>% \n filter(Total >= 1985) %>% \n filter(Total %nin% c(19103, 19203, 19303)) %>% \n mutate(across(everything(), as.numeric)) %>% \n rename(year = Total) %>% \n rename_with(\n ~stringr::str_remove_all(\n .\n ,\", percent of all persons age 25 and over|1\"\n )\n )\n\nThen I am going to drop the column ‘Total - Asian/Pacific Islander’, each of these races is stored in a seperate column so if I needed the total later for some reason I could calculate it. 
I am also going to drop the string “Asian/Pacific Islander -”, from the begin of each of those columns, so they will now tell me just which race each column refers too.\n\nhs_students <- hs_students_raw %>% \n filter(Total >= 1985) %>% \n filter(Total %nin% c(19103, 19203, 19303)) %>% \n mutate(across(everything(), as.numeric)) %>% \n rename(year = Total) %>% \n rename_with(\n ~stringr::str_remove_all(\n .\n ,\", percent of all persons age 25 and over|1\"\n )\n ) %>% \n select(-contains(\"Total - Asian/Pacific Islander\")) %>% \n rename_with(\n ~stringr::str_remove_all(\n .\n ,\"Asian/Pacific Islander - \"\n )\n )\n\nI now simply pivot the data longer. A nice trick I learned since I want to pivot everything expect the year column is to use the minus sign to select every column expect the year column in the pivot.\n\nhs_students_long <- hs_students %>% \n tidyr::pivot_longer(-year)\n\nWith the data now in long form I am going to separate the automatically generate name column into two columns titled, stat and race. The data contains both the percent that graduated and the standard error. Then I replace all the NA’s in the stat column with Total, as these are the total percentage and the other rows will be the standard error. Last I dropped the s from standard errors to make it singular.\n\nhs_students_long <- hs_students %>% \n tidyr::pivot_longer(-year) %>% \n tidyr::separate(name, c(\"stat\", \"race\"), sep = \"- \", fill = \"left\") %>% \n tidyr::replace_na(list(stat = \"Total\")) %>% \n mutate(\n across(\n stat\n ,~stringr::str_replace(\n .\n ,\"Standard Errors\"\n ,\"Standard Error\"\n )\n )\n )\n\nI know pivot the date back to wide form, and use the Janitor package to clean the column names. This puts them in lowercase with _ for spaces.\n\nhs_students_wide <- hs_students_long %>% \n tidyr::pivot_wider(names_from = stat, values_from = value) %>% \n janitor::clean_names()\n\nTo make graphing a bit easier with the scales package, I divide both columns by 100. 
We will see why in the graphs.\n\nhs_students_wide <- hs_students_long %>% \n tidyr::pivot_wider(names_from = stat, values_from = value) %>% \n janitor::clean_names() %>% \n mutate(across(total:standard_error, ~.x/100))\n\nIt’s now time to graph. Notice the use scales::label_percent() as the labels value for the y axis. If the numbers were left as the default values (75 vs 0.75) the percentages would have been 750%, which is obviously very wrong! I also use geom_ribbon to draw the standard error bars around each line. Notice the use of color = NA, by default the ribbon has outlines, I did not like this so doing color = NA turns them off. (It should be noted there are a few other solutions to turning them off but this seemed the easiest to me). Last we see the use of the aesthetics argument in scale_color_brewer. By setting this we match the color and fill to be the same color, without setting this, the colors of the error bars and lines don’t match!\n\nhs_students_wide <- hs_students_wide %>% \n mutate(\n ymax = total - standard_error\n ,ymin = total + standard_error\n )\n\ng1 <- hs_students_wide %>% \n filter(race != \"Total\") %>% \n ggplot(aes(x = year, y = total, group = race, color = race)) +\n geom_ribbon(aes(ymax = ymax, ymin = ymin, fill = race), alpha = 0.3, color = NA) +\n geom_line() +\n scale_x_continuous(breaks = seq(1985,2016,3)) +\n scale_y_continuous(labels = scales::label_percent()) +\n scale_color_brewer(palette = \"Dark2\", aesthetics = c(\"color\", \"fill\")) +\n theme_bw() +\n labs(\n x = NULL\n ,y = NULL\n ,title = glue::glue(\"Percentage of High School Graduates by Race\"\n ,\"\\n\"\n ,\"1985 - 2016\")\n ,color = \"Race\" \n ,fill = \"Race\"\n ) +\n theme(\n plot.title = element_text(hjust = 0.5)\n ,legend.title = element_text(hjust = 0.5)\n )\n \ng1\n\n\n\n\n\n\nLoad Male/Female Data\nNow the file also contains the same information but split by male and female. 
I am going to load in that data.\n\nmale_hs_raw <- readxl::read_excel(\"104.10.xlsx\", sheet = 3)\nfemale_hs_raw <- readxl::read_excel(\"104.10.xlsx\", sheet = 5)\n\nHere I will use the same manipulations as above, the only addition is adding a column for sex.\n\nmale_hs <- male_hs_raw %>% \n filter(Total >= 1985) %>% \n filter(Total %nin% c(19103, 19203, 19303)) %>% \n mutate(across(everything(), as.numeric)) %>% \n rename(year = Total) %>% \n rename_with(\n ~stringr::str_remove_all(\n .\n ,\", percent of all persons age 25 and over|1\"\n )\n ) %>% \n select(-contains(\"Total - Asian/Pacific Islander\")) %>% \n rename_with(\n ~stringr::str_remove_all(\n .\n ,\"Asian/Pacific Islander - \"\n )\n ) %>% \n tidyr::pivot_longer(-year) %>% \n tidyr::separate(name, c(\"stat\", \"race\"), sep = \"- \", fill = \"left\") %>% \n tidyr::replace_na(list(stat = \"Total\")) %>% \n mutate(\n across(\n stat\n ,~stringr::str_replace(\n .\n ,\"Standard Errors\"\n ,\"Standard Error\"\n )\n )\n ,sex = \"Male\"\n )\n\n\nfemale_hs <- female_hs_raw %>% \n filter(Total >= 1985) %>% \n filter(Total %nin% c(19103, 19203, 19303)) %>% \n mutate(across(everything(), as.numeric)) %>% \n rename(year = Total) %>% \n rename_with(\n ~stringr::str_remove_all(\n .\n ,\", percent of all persons age 25 and over|1\"\n )\n ) %>% \n select(-contains(\"Total - Asian/Pacific Islander\")) %>% \n rename_with(\n ~stringr::str_remove_all(\n .\n ,\"Asian/Pacific Islander - \"\n )\n ) %>% \n tidyr::pivot_longer(-year) %>% \n tidyr::separate(name, c(\"stat\", \"race\"), sep = \"- \", fill = \"left\") %>% \n tidyr::replace_na(list(stat = \"Total\")) %>% \n mutate(\n across(\n stat\n ,~stringr::str_replace(\n .\n ,\"Standard Errors\"\n ,\"Standard Error\"\n )\n )\n ,sex = \"Female\"\n )\n\nHere we will combine the two data frames and then pivot to our final graphing form.\n\nmale_female_hs_wide <- male_hs %>% \n bind_rows(female_hs) %>% \n tidyr::pivot_wider(names_from = stat, values_from = value) %>% \n 
janitor::clean_names() %>% \n mutate(across(total:standard_error, ~.x/100)) %>% \n mutate(\n ymax = total - standard_error\n ,ymin = total + standard_error\n )\n\nLets first graph the total for Male and Female graduation rates.\n\ng2 <- male_female_hs_wide %>% \n filter(race == \"Total\") %>% \n ggplot(aes(x = year, y = total, group = sex, color = sex)) +\n geom_ribbon(aes(ymax = ymax, ymin = ymin, fill = sex), alpha = 0.3, color = NA) +\n geom_line() +\n scale_x_continuous(breaks = seq(1985,2016,3)) +\n scale_y_continuous(labels = scales::label_percent()) +\n scale_color_brewer(palette = \"Dark2\", aesthetics = c(\"color\", \"fill\")) +\n theme_bw() +\n labs(\n x = NULL\n ,y = NULL\n ,title = glue::glue(\"Percentage of High School Graduates by Sex\"\n ,\"\\n\"\n ,\"1985 - 2016\")\n ,color = \"Sex\" \n ,fill = \"Sex\"\n ) +\n theme(\n plot.title = element_text(hjust = 0.5)\n ,legend.title = element_text(hjust = 0.5)\n )\n\ng2\n\n\n\n\nNow I am going to graph by Sex and Race.\n\nrace_filter <- c(\"White\", \"Black\", \"Hispanic\")\n\nmake_label <- function(label){\n # browser()\n result <- stringr::str_split(label, \"\\\\.\")\n unlist(lapply(result, function(x) paste(x[2],x[1])))\n}\n\n\ng2 <- male_female_hs_wide %>% \n filter(race %in% race_filter) %>% \n ggplot(aes(x = year, y = total, group = interaction(sex,race), color = interaction(sex,race))) +\n geom_ribbon(aes(ymax = ymax, ymin = ymin, fill = interaction(sex,race)), alpha = 0.3, color = NA) +\n geom_line() +\n scale_x_continuous(breaks = seq(1985,2016,3)) +\n scale_y_continuous(labels = scales::label_percent()) +\n scale_color_brewer(palette = \"Dark2\", aesthetics = c(\"color\", \"fill\"), labels = make_label) +\n theme_bw() +\n labs(\n x = NULL\n ,y = NULL\n ,title = glue::glue(\"Percentage of High School Graduates by Race and Sex\"\n ,\"\\n\"\n ,\"1985 - 2016\")\n ,color = \"Race & Sex\" \n ,fill = \"Race & Sex\"\n ) +\n theme(\n plot.title = element_text(hjust = 0.5)\n ,legend.title = element_text(hjust 
= 0.5)\n )\n\ng2\n\n\n\n\n\n\nConclusion\nWhile I am sure there is much more that could be done with this data this is where I am going to stop for today. Our graphs clearly show a divide in graduation rates by race, however Sex does not seem to have much of an effect on graduation rates.\n\n\n\n\nReusehttps://creativecommons.org/licenses/by/4.0/CitationBibTeX citation:@online{belanger2021,\n author = {Belanger, Kyle},\n title = {TidyTuesday 2021 {Week} 6: {HBCU} {Enrollment}},\n date = {2021-02-26},\n langid = {en}\n}\nFor attribution, please cite this work as:\nBelanger, Kyle. 2021. “TidyTuesday 2021 Week 6: HBCU\nEnrollment.” February 26, 2021." - }, - { - "objectID": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html", - "href": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html", - "title": "Converting From Blogdown to Distill", - "section": "", - "text": "I have since converted this blog to a quarto blog, but am leaving this post up in case anyone finds it useful" - }, - { - "objectID": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#code-folding", - "href": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#code-folding", - "title": "Converting From Blogdown to Distill", - "section": "Code Folding", - "text": "Code Folding\nWhen I converted my blog on 12/30/2020, code folding was not included as an option by default in distill. At that time, an excellent package called Codefolder added the functionality. Since going live with the blog, code folding has been added to distill.1 Code folding is available for either the whole document or individual code sections. 
The default caption is “Show Code”, but instead of typing code_folding=TRUE, you can provide a string to change the caption.\n\n# Some awesome code \n# That does awesome things" - }, - { - "objectID": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#customizing-the-home-page", - "href": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#customizing-the-home-page", - "title": "Converting From Blogdown to Distill", - "section": "Customizing the Home Page", - "text": "Customizing the Home Page\nBy default, a distill blog’s home page will be the blog index page. I chose to edit my home page to be a landing page for myself and then have the blog index as a separate page. When creating a new blog, this is the default YAML header for your index page.\n---\ntitle: \"New Site\"\nsite: distill::distill_website\nlisting: posts\n---\nThe critical piece here is the line site: distill::distill_website. This line is what is needed to render the website. For my home page, I decided to use the package Postcard, which is used to generate simple landing pages. I won’t go into every step as there is already a great post by Alison Hill on how to do that. 
However, I will point out the most crucial part of the new index page the YAML header needs to contain these two lines.\noutput:\n postcards::trestles\nsite: distill::distill_website" - }, - { - "objectID": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#footnotes", - "href": "posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.html#footnotes", - "title": "Converting From Blogdown to Distill", - "section": "Footnotes", - "text": "Footnotes\n\n\nNote that as of publishing, code folding is only available in the development version of distill↩︎" } ] \ No newline at end of file diff --git a/posts/2020-06-25_diabetes-prevalence-in-nc/apa.csl b/posts/2020-06-25_diabetes-prevalence-in-nc/apa.csl new file mode 100644 index 0000000..5ad5d3d --- /dev/null +++ b/posts/2020-06-25_diabetes-prevalence-in-nc/apa.csl @@ -0,0 +1,1718 @@ + + \ No newline at end of file diff --git a/posts/2020-06-25_diabetes-prevalence-in-nc/basic_bib.bib b/posts/2020-06-25_diabetes-prevalence-in-nc/basic_bib.bib new file mode 100644 index 0000000..9d989c9 --- /dev/null +++ b/posts/2020-06-25_diabetes-prevalence-in-nc/basic_bib.bib @@ -0,0 +1,57 @@ +% Encoding: UTF-8 + +@Electronic{CDCP2020, + author = {{Centers for Disease Control and Prevention}}, + organization = {{US Department of Health and Human Services}}, + title = {National Diabetes Statistics Report}, + url = {https://www.cdc.gov/diabetes/pdfs/data/statistics/national-diabetes-statistics-report.pdf}, + year = {2020}, +} + +@Article{Pierannunzi2012, + author = {Carol Pierannunzi and Machell Town and William Garvin and Frederick E. 
Shaw and Lina Balluz}, + journal = {Morbidity and Mortality Weekly Report}, + title = {Methodologic Changes in the Behavioral Risk Factor Surveillance System in 2011 and Potential Effects on Prevalence Estimates}, + year = {2012}, + number = {22}, + pages = {410 - 413}, + volume = {61}, + url = {https://www.cdc.gov/mmwr/pdf/wk/mm6122.pdf}, +} + +@Article{Rao2003, + author = {Rao JNK}, + title = {Small Area Estimation}, + year = {2003}, + url = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/0471722189.fmatter}, +} + +@Article{Barker2013, + author = {Lawrence E. Barker and Theodore J. Thompson and Karen A Kirtland and James P Boyle and Linda S Geiss and Mary M McCauley and Ann L. Albright}, + journal = {Journal of Data Science}, + title = {Bayesian Small Area Estimates of Diabetes Incidence by United States County, 2009}, + year = {2013}, + month = apr, + number = {1}, + pages = {269-280}, + volume = {11}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4537395/}, +} + +@Article{Klein2001, + author = {R J Klein and C A Schoenborn}, + journal = {Healthy People 2000 Stat Notes}, + title = {Age adjustment using the 2000 projected U.S. population}, + year = {2001}, + pages = {1-9}, + volume = {20}, +} + +@Electronic{ADA2015, + author = {{American Diabetes Association}}, + title = {The Burden of Diabetes in North Carolina}, + url = {http://main.diabetes.org/dorg/PDFs/Advocacy/burden-of-diabetes/north-carolina.pdf}, + year = {2015}, +} + +@Comment{jabref-meta: databaseType:bibtex;} diff --git a/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-1.R b/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-1.R new file mode 100644 index 0000000..c0e1541 --- /dev/null +++ b/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-1.R @@ -0,0 +1,541 @@ +#These first few lines run only when the file is run in RStudio, !!NOT when an Rmd/Rnw file calls it!! +rm(list=ls(all=TRUE)) #Clear the variables from previous runs. 
+cat("\f") # clear console + +# ---- load-packages -------------------------------------------------- +# Attach these packages so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path +library(magrittr) # enables piping : %>% +library(dplyr) # data wrangling +library(ggplot2) # graphs +library(tidyr) # data tidying +library(maps) +library(mapdata) +library(sf) +library(readr) + + +# ---- load-sources --------------------------------------------------- + + + +# ---- declare-globals ---------------------------------------------------- + +#set ggplot theme +ggplot2::theme_set(theme_bw()) + +# ---- load-data ------------------------------------------------------ + +# load the data, and have all column names in lowercase + +nc_diabetes_data_raw <- read_csv("https://raw.githubusercontent.com/mmmmtoasty19/nc-diabetes-epidemic-2020/62bdaa6971fbdff09214de7c013d40122abbe40d/data-public/derived/nc-diabetes-data.csv") %>% + rename_all(tolower) + +us_diabetes_data_raw <- read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/62bdaa6971fbdff09214de7c013d40122abbe40d/data-public/raw/us_diabetes_totals.csv" + ,skip = 2) + +rural_counties <- read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/rural-counties.csv") + +county_centers_raw <- read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/raw/nc_county_centers.csv", col_names = c("county", "lat","long")) + +diabetes_atlas_data_raw <- read_csv("https://raw.githubusercontent.com/mmmmtoasty19/nc-diabetes-epidemic-2020/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/raw/DiabetesAtlasData.csv" + ,col_types = cols(LowerLimit = col_skip(), + UpperLimit = col_skip(), + Percentage = col_double()), skip = 2) + + + + +# ---- load-map-data ---------------------------------------------------------- + +# load in both US State Map 
and NC County Map + +nc_counties_map_raw <- st_as_sf(map("county",region = "north carolina", plot = FALSE,fill = TRUE)) %>% + mutate_at("ID", ~stringr::str_remove(.,"north carolina,")) + +state_map_raw <- st_as_sf(map("state",plot = FALSE,fill = TRUE )) + +nc_cities <- st_as_sf(read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/nc_cities.csv"), + coords = c("long", "lat") + ,remove = FALSE + ,agr = "constant" + ,crs = 4326) + + + + +# ---- tweak-data -------------------------------------------------------------- + +county_centers <- county_centers_raw %>% + mutate_all(~stringr::str_replace_all(., + c("\\°" = "" + ,"\\+" = "" + ,"\\–" = "-" + ) + ) + ) %>% + mutate(across(c("lat","long"), ~iconv(.,from = 'UTF-8', to = 'ASCII//TRANSLIT')) + ,across(c("lat","long"),~stringr::str_remove_all(.,"\\?"))) %>% + mutate_at(c("lat","long"),as.numeric) %>% + mutate(across("long", ~(. * -1))) %>% + mutate_at("county", tolower) + + + + +us_diabetes_data <- us_diabetes_data_raw %>% + filter(Year >= 2000) %>% + select( "Year","Total - Percentage") %>% + rename(year = Year , us_pct = `Total - Percentage`) + +diabetes_atlas_data <- diabetes_atlas_data_raw %>% + mutate_at("State", tolower) %>% + filter(Year >= 2000) + +state_map_abb <- state_map_raw %>% + left_join(read_csv("https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/raw/b29bfd93b20b73a7000d349cb3b55fd0822afe76/data-public/metadata/state-abb.csv") %>% + mutate_at("state", tolower) + ,by = c("ID" = "state") ) + + + +# ---- merge-data --------------------------------------------------------- + +#join US totals to NC data + +nc_diabetes_data <- nc_diabetes_data_raw %>% + mutate_at("county", ~stringr::str_replace_all(.,"Mcdowell","McDowell")) %>% + mutate( + rural = county %in% rural_counties$rural_counties + ) %>% + mutate_at("county",tolower) %>% + left_join(us_diabetes_data) + + +nc_counties_map <- nc_counties_map_raw %>% + 
left_join(nc_diabetes_data, by = c("ID" = "county")) %>% + left_join(county_centers, by = c("ID" = "county")) %>% + rename( + center_long = long + ,center_lat = lat) + +state_map <- state_map_abb %>% + left_join(diabetes_atlas_data, by = c("ID" = "State")) %>% + rename_all(tolower) + + + +# ---- o-g1 ------------------------------------------------------------------ + + +us_diabetes_data <- us_diabetes_data %>% + mutate( + change = lead(us_pct) - us_pct + ,change = if_else(change > 0, TRUE, FALSE) + ) %>% + mutate_at("change", ~stringr::str_replace_na(.,"NA")) + + + +o_g1 <- us_diabetes_data %>% + ggplot(aes(x = year, y = us_pct)) + + geom_line(color= "#D95F02") + + # geom_line(aes(color = change, group = 1)) + + geom_point(shape = 21, size = 3,color= "#D95F02") + + # geom_point(aes(color = change),shape = 21, size = 3) + + scale_color_manual(values = c( + "TRUE" = "#D95F02" + ,"FALSE" = "#7570B3" + ), guide = FALSE) + + labs( + title = "Percentage of Diagnosed Diabetes in Adults (18+), National Level" + ,x = NULL + ,y = NULL + ,caption = "Note: Data from the CDC's National Health Interview Survey (NHIS)" + ) + +o_g1 + + + +# ---- s-g1 ----------------------------------------------------------------- + + +s_g1 <- state_map %>% + st_drop_geometry() %>% + ggplot(aes(x = year, y = percentage, color = region)) + + geom_line(aes(group = id ),alpha = 0.3,na.rm = TRUE) + + geom_smooth(method = "lm", se = FALSE) + + ggpmisc::stat_poly_eq(formula = y ~ + x , + aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), + parse = TRUE) + + geom_vline(xintercept = 2011, linetype = "dashed", color = "gray") + + scale_color_brewer(palette = "Dark2" + ,direction = -1 + ,labels = snakecase::to_title_case + ) + + labs( + title = "Percentage of Diagnosed Diabetes in Adults (18+) \nby State and Region" + ,x = NULL + ,y = NULL + ,color = "Region" + ,caption = "Regions from US Census Bureau" + ) + +s_g1 + + + + +# ---- s-g2 
--------------------------------------------------------------- + +s_g2 <- state_map %>% + st_drop_geometry() %>% + filter(region == "south") %>% + mutate_at("id", ~snakecase::to_title_case(.)) %>% + ggplot(aes(x = year, y = percentage)) + + geom_line(aes(group = id ),na.rm = TRUE, color= "#D95F02") + + gghighlight::gghighlight(id == "North Carolina", label_params = list(vjust = 3)) + + scale_y_continuous(breaks = seq(5,13,2)) + + scale_x_continuous(minor_breaks = seq(2000,2016,1)) + + labs( + title = "Percentage of Diagnosed Diabetes in Adults (18+) \nSouth Region" + ,x = NULL + ,y = NULL + ,caption = "Regions from US Census Bureau" + ) + + theme() + +s_g2 + + + +# ---- nc-g1 ---------------------------------------------------------------------- + +d1 <- nc_diabetes_data %>% + group_by(year) %>% + summarise( + pct = mean(percentage) + ,us_pct = mean(us_pct) + ) %>% + pivot_longer( + cols = c("pct", "us_pct") + ,names_to = "metric" + ,values_to = "values" + ) %>% + mutate( + metric = factor(metric + ,levels = c("pct","us_pct") + ,labels = c("NC", "National")) + ) + +nc_g1 <- d1 %>% + ggplot(aes(x = year, y = values, color = metric)) + + geom_line() + + geom_point(shape = 21, size = 3) + + geom_vline(xintercept = 2011, linetype = "dashed", color = "gray") + + scale_y_continuous(labels = function(x) paste0(x, "%")) + + scale_color_brewer(palette = "Dark2") + + labs( + x = NULL + ,y = NULL + ,color = NULL + ,title = "Percent of Adults (20+) with Diagnosed Diabetes" + ) + +nc_g1 + +# ---- nc-data-aberration --------------------------------------------------- + +nc_g1a <- nc_diabetes_data %>% + ggplot(aes(x = year, y = percentage)) + + geom_line(aes(group = county),alpha = 0.4) + + labs( + x = NULL + ,y = NULL + ,color = NULL + ) + +nc_g1a + + +# ---- nc-g2 ----------------------------------------------------------------- + +d2 <- nc_diabetes_data %>% + select(-us_pct) %>% + mutate( + pct_rural = if_else(rural == TRUE, percentage, NA_real_) # typed NA, not FALSE: FALSE is rejected by dplyr < 1.1 or cast to 0 by dplyr >= 1.1, and a 0 is not dropped by mean(na.rm = TRUE) below + ,pct_urban = if_else(rural == 
FALSE, percentage, NA_real_) + ) %>% + select(-countyfips,-percentage) %>% + group_by(year) %>% + summarise( + pct_rural = mean(pct_rural,na.rm = TRUE) + ,pct_urban = mean(pct_urban,na.rm = TRUE) + ) %>% left_join(us_diabetes_data) %>% + pivot_longer( + cols = c("us_pct", "pct_rural","pct_urban") + ,names_to = "metric" + ,values_to = "value" + ,values_drop_na = TRUE + ) %>% + mutate( + metric = factor(metric, + levels = c("pct_rural","pct_urban","us_pct") + ,labels = c("Rural","Urban","US") + ) + ) + +nc_g2 <- d2 %>% ggplot(aes(x = year, y = value, color = metric)) + + geom_line() + + geom_point(shape = 21, size = 3) + + geom_vline(xintercept = 2011, linetype = "dashed", color = "gray") + + scale_y_continuous(labels = function(x) paste0(x, "%")) + + scale_color_brewer(palette = "Dark2") + + labs( + x = NULL + ,y = NULL + ,color = NULL + ,title = "Percent of Adults (20+) with Diagnosed Diabetes \nDisplaying Rural vs Urban" + ) + +nc_g2 + +# ---- spaghetti-plot ---------------------------------------------------- + +g50 <- nc_diabetes_data %>% + filter(year < 2015) %>% + mutate( + rural = factor(rural + ,levels = c(TRUE,FALSE) + ,labels = c("Rural", "Urban") + ) + ) %>% + ggplot(aes(x = year, y = percentage, color = rural)) + + geom_line(aes(group = county),alpha = 0.3) + + geom_smooth(aes(group = rural), method = "loess", se= FALSE, size = 1.1) + + scale_color_brewer(palette = "Dark2") + + labs( + title = "Percent of Adults (20+) with Diagnosed Diabetes \nAll North Carolina Counties" + ,x = NULL + ,y = NULL + ,color = NULL + ) + +g50 + + +# ---- c-g1 -------------------------------------------------------------- + + +nc_counties_map_binned <- nc_counties_map %>% + filter(year < 2015) %>% + mutate( + bin = dlookr::binning(.$percentage, nbins = 6 ,type = "equal") + ,bin = forcats::fct_recode(bin + ,"6.5 - 7.9" = "[6.5,7.97]" + ,"8.0 - 9.4" = "(7.97,9.43]" + ,"9.5 - 10.9" = "(9.43,10.9]" + ,"11.0 - 12.4" = "(10.9,12.4]" + ,"12.5 - 13.8" = "(12.4,13.8]" + ,"13.9 - 15.3" = 
"(13.8,15.3]" + ) + ) + +c_g1 <- nc_counties_map_binned %>% + filter(year %in% c(2006,2014)) %>% + ggplot() + + geom_sf() + #blank geom_sf keeps gridlines from overlapping map + geom_sf(aes(fill = bin,color = rural)) + + geom_sf(data = nc_cities) + + ggrepel::geom_text_repel(data = nc_cities, + aes(x = long, y = lat, label = city) + ,nudge_y = c(-1,1,1,-1,1) + ,nudge_x = c(0,0,0,-1,0) + ) + + geom_text(data = . %>% filter(rural == TRUE) + ,aes(x = center_long, y = center_lat) + ,label = "R" + ,color = "#696969" + ) + + coord_sf(xlim = c(-84.5,-75.5), ylim = c(33.75,37)) + + facet_wrap(~year) + + scale_fill_viridis_d(alpha = 0.6, direction = -1) + + scale_color_manual( + values = c( + "FALSE" = "gray" + ,"TRUE" = "black" + ),guide = 'none') + + labs( + title = "Estimated Diabetes in Adults (20+) by County" + ,fill = "Percentage" + ,y = NULL + ,x = NULL + ) + + theme( + panel.background = element_rect(fill = "aliceblue") + ,panel.grid.major = element_line(color = "#D4D4D4", linetype = "dashed", + size = 0.5) + ,legend.position = "bottom" + ,plot.title = element_text(hjust = 0.5) + ) + +c_g1 + +# ---- county-distribution-histogram ------------------------------------ + +# Not USED +c_g1a <- nc_counties_map_binned %>% + mutate( + rural = factor(rural + ,levels = c(TRUE,FALSE) + ,labels = c("Rural", "Urban") + ) + ) %>% + filter(year %in% c(2006,2014)) %>% + ggplot(aes(x = bin, fill = rural)) + + geom_bar(stat = "count" + ,position = "dodge" + ) + + geom_text(aes(label=..count..) 
+ ,position = position_dodge(width = 1) + ,stat = "count" + ,vjust = -0.1 + ,size = 5) + + facet_wrap(~year) + + scale_fill_brewer(palette = "Dark2") + + labs( + fill = NULL + ,x = NULL + ,y = NULL + ) + +c_g1a + +# ---- county-boxplot ---- + +c_g1c <- nc_counties_map %>% + mutate( + rural = factor(rural + ,levels = c(TRUE,FALSE) + ,labels = c("Rural", "Urban") + )) %>% + filter(year < 2015) %>% + ggplot(aes(x = year, y = percentage, group = interaction(year,rural), fill = rural)) + + geom_boxplot(alpha = 0.5) + + scale_fill_brewer(palette = "Dark2") + + scale_x_continuous(breaks = seq(2004,2014,2)) + + labs( + x = NULL + ,y = NULL + ,fill = NULL + ,title = "Distribution of Estimated Cases by County 2006 - 2014" + ) + +c_g1c + + + + +# ---- c-g4 --------------------------------------------------------------- +d3 <- nc_counties_map %>% + st_drop_geometry() %>% + filter(year %in% c(2006,2014)) %>% + select(-countyfips,-us_pct) %>% + pivot_wider(names_from = "year" + ,values_from = "percentage") %>% + mutate( + pct_p = `2014` - `2006` + ,pct_c = ((`2014` - `2006`)/`2006`) * 100 + ) %>% + left_join(nc_counties_map_raw) %>% + st_as_sf() + + +c_g4 <- d3 %>% + ggplot() + + geom_sf() + #blank geom_sf keeps gridlines from overlapping map + geom_sf(aes(fill = pct_c ,color = rural)) + + geom_sf(data = nc_cities) + + ggrepel::geom_text_repel(data = nc_cities, + aes(x = long, y = lat, label = city) + ,nudge_y = c(-1,1,1,-1,1) + ,nudge_x = c(0,0,0,-1,0) + ) + + geom_text(data = . 
%>% filter(rural == TRUE) + ,aes(x = center_long, y = center_lat) + ,label = "R" + ,color = "#696969" + ) + + # scale_fill_viridis_c(alpha = 0.6, direction = -1) + + scale_fill_gradient2( + low = "#d01c8b" + ,mid = "#f7f7f7" + ,high = "#4dac26" + ,midpoint = 0 + ) + + scale_color_manual( + values = c( + "FALSE" = "gray" + ,"TRUE" = "black" + ),guide = 'none') + + labs( + title = "Percentage Change of Diagnosed Diabetes 2006-2014" + ,fill = "Percentage" + ,y = NULL + ,x = NULL + ) + + theme( + panel.background = element_rect(fill = "aliceblue") + ,panel.grid.major = element_line(color = "#D4D4D4", linetype = "dashed", + size = 0.5) + ) + +c_g4 + + +# ---- pct_p-histogram ---------------------------------------------------------- + + + + +d4 <- d3 %>% + st_drop_geometry() %>% + mutate( + rural = factor(rural + ,levels = c(TRUE,FALSE) + ,labels = c("Rural", "Urban") + ) + ) + + +mean_d4 <- d4 %>% + group_by(rural) %>% + summarise(.groups = "keep" + ,pct_c = mean(pct_c) + ) + +g51 <- d4 %>% + ggplot(aes(x = pct_c, fill = rural, y = ..density.., color = rural)) + + geom_histogram(binwidth = 5, position = "identity", alpha = 0.3) + + geom_density(alpha = 0.5) + + facet_wrap(~rural, ncol = 1) + + geom_vline(aes(xintercept = pct_c), data = mean_d4) + + geom_text(aes(x = pct_c, y = 0.038, label = round(pct_c, 2)) + ,data = mean_d4 + ,hjust = -0.15 + ,size = 5 + ,color = "#000000") + + geom_vline(xintercept = 0, linetype = "dashed", color = "#696969") + + scale_color_brewer(palette = "Dark2", guide = NULL) + + scale_fill_brewer(palette = "Dark2", guide = NULL) + + labs( + x = "Percentage Change" + ,y = "Density" + ,fill = NULL + ) +g51 + + + + + diff --git a/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends.qmd b/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends.qmd new file mode 100644 index 0000000..78a3d11 --- /dev/null +++ 
b/posts/2020-06-25_diabetes-prevalence-in-nc/diabetes-in-rural-north-carolina-exploring-prevalence-trends.qmd @@ -0,0 +1,185 @@ +--- +title: "Diabetes in Rural North Carolina : Exploring Prevalence Trends" +subtitle: | + This post introduces the exploration of the Diabetes epidemic in North Carolina +date: 06-25-2020 +bibliography: basic_bib.bib +csl: apa.csl +code-fold: true +--- + +```{r, echo=FALSE} +knitr::read_chunk("diabetes-1.R") +``` + +# Update + +2022-03-15: Since this was posted, the CDC has updated how county-level diabetes prevalence is calculated. The data presented here use the previous calculations and may no longer be correct. More can be read [here](https://www.cdc.gov/diabetes/data/statistics/faqs.html) + + +# Abstract + +Diabetes is growing at an epidemic rate in the United States. In North Carolina alone, diabetes and prediabetes cost an estimated $10.9 billion each year [@ADA2015]. This post introduces the exploration of the Diabetes epidemic in North Carolina. Through a series of posts this project will examine various public data available on diabetes and explore possible solutions to address the rise of diabetes in North Carolina. This investigation stems from the Capstone project of my Health Care Informatics Masters program. This post will answer the following questions: + + +> 1. What is the overall trend of diabetes prevalence in the United States? + +> 2. What is the trend of diabetes at a State Level and how does diabetes prevalence vary by state and region? + +> 3. How do trends in diabetes prevalence vary across counties of North Carolina? + +> 4. In which counties of North Carolina does the largest change in diabetes prevalence occur? + +> 5. How does change in diabetes prevalence compare between rural and urban counties? + +# Environment + +___This section contains technical information for deeper analysis and reproduction. Casual readers are invited to skip it.___ + +Packages used in this report. 
+ +```{r load-packages} +``` + +Definitions of global objects (file paths, factor levels, object groups) used throughout the report. + +```{r declare-globals} +``` + +# Data + +The data for this exploration comes from several sources: + +1. The Diabetes data set for state and county levels was sourced from the US Diabetes Surveillance System; Division of Diabetes Translation - [Centers for Disease Control and Prevention](https://www.cdc.gov/diabetes/data). The data was downloaded one year per file, and compiled into a single data set for analysis. + +2. The Diabetes data set for National level data was sourced from the CDC's National Health Interview Survey [(NHIS)](https://www.cdc.gov/nchs/nhis/index.htm) + +3. The list of rural counties was taken from The Office of Rural Health Policy; the list is available [here](https://www.hrsa.gov/sites/default/files/hrsa/ruralhealth/resources/forhpeligibleareas.pdf) + + +```{r load-data} +``` + + +```{r load-map-data} +``` + +# Data Manipulation + +The combined data used in this analysis can be downloaded [here](https://github.com/mmmmtoasty19/nc-diabetes-epidemic-2020/blob/master/data-public/derived/nc-diabetes-data.csv). +The only tweaks done here are to combine the rural counties column, and the data for creating maps. + +### Tweaks + + +```{r tweak-data} +``` + +### Merge + + +```{r merge-data} +``` + +# Overall - National Level + +```{r o-g1} +``` + +Overall, the national average for diagnosed diabetes sharply rose through the early 2000's, leveling off around 2010. These numbers, however, are estimates based on the self-reported response to the CDC's National Health Interview Survey, and do not represent the actual confirmed diagnoses. The CDC estimates that 1 in 5 adults have undiagnosed diabetes; therefore, the numbers reported by the NHIS are likely to underestimate the true prevalence [@CDCP2020]. 
+ +# Overall - State Level + +State and County level data on diabetes prevalence are taken from the CDC's Behavioral Risk Factor Surveillance System [(BRFSS)](https://www.cdc.gov/brfss/index.html). These results are based on the question "Has a doctor, nurse, or other health professional ever told you that you have diabetes?". Women who only experienced diabetes during pregnancy were excluded from the counts. The BRFSS is an ongoing, monthly telephone survey of the non-institutionalized adults (aged 18 years or older) in each state. The year 2011 saw a major change to the methodology of the survey, which started to include homes without a landline phone. This change was expected to increase coverage of lower income, lower educational levels, and younger age groups, because these groups often exclusively rely on cellular telephones for personal communication.[@Pierannunzi2012] + + +```{r s-g1} +``` + +The above graph shows diabetes prevalence trends by state, grouped into regions based on the US Census classification [regions](https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf). While all regions of the United states show positive growth in diabetes prevalence, the south exhibits a slightly higher growth rate, as well as the highest prevalence. + +```{r s-g2} +``` + +When focusing on the south region, North Carolina falls close to the middle of diabetes prevalence. + +# Overall - North Carolina + +When examining the trajectory for North Carolina, we can see that it has been consistently higher than national average . We see that in 2016 there was a large spike in diagnosed cases; unfortunately this is the last available year so it is unclear whether the upward trend continues. The graph below compares state-level average to the national average. Notice that the trend line is slightly higher than in the previous graphs: this is due to the age cut offs used for National and State level data vs County Level data. 
Previous data used 18 years of age as a cutoff for classifying adults, whereas the county level data uses 20. Due to removing 18- and 19-year-olds from the population, who typically have less diagnosed cases of diabetes than those of older ages, the computed prevalence increases slightly. + +```{r nc-g1} +``` + +We see a spike in 2016, the last year for which the data are available. However, we should be careful with our interpretation of this pattern, because the examination of the county-level trajectories reveals an aberration in the trend that requires a more rigorous investigation. + + +```{r nc-data-aberration} +``` + +While all of North Carolina has a higher prevalence than the national average, rural counties have systematically higher prevalence of diabetes than urban counties. Note that after 2011 both Urban and Rural counties break the upward trend exhibited in the previous 5 years. This could be explained by the addition of cell phones to the BRFS Survey as many rural areas are often lower income areas and may only rely on a cell phone for communication. As mentioned previously there is an odd spike in case in 2016 that can’t be explained by current documentation. For the purpose of this evaluation 2016 will be excluded from the county level data since the odd trend can not be explained and no further data is available to determine if this is a real spike or could be attributed to methodology change or data quality. + + +```{r nc-g2} +``` + + +# By County - Geographical + +County level data first became available in 2004, three years of data is used to arrive at these estimates. For example, the 2006 estimates were computed using the data from 2005, 2006, and 2007 BRFS survey rounds. The county-level estimates were based on indirect model-dependent estimates using Bayesian multilevel modeling techniques[@Rao2003 ; @Barker2013]. 
This model-dependent approach employs a statistical model that "borrows strength" in making an estimate for one county from BRFSS data collected in other counties and states. Multilevel Binomial regression models with random effects of demographic variables (age 20-44, 45-64, >=65; race/ethnicity; sex) at the county-level were developed. Estimates were adjusted for age to the 2000 US standard population using age groups of 20-44, 45-64, and 65 or older[@Klein2001]. + +```{r spaghetti-plot} +``` + +When viewing all county trend lines together, we see that the loess line for both urban and rural follows a similar trend for the time period. + +The following graphs display the total estimated prevalence of Diabetes in each of the 100 North Carolina counties. To keep the scaling consistent between the graphs, we binned the estimates into 6 intervals of the same size. Rural counties are highlighted with a stronger border line as well as a letter “R” in respective geographic centers. These graphs allow us to view geographical clusters of diabetes prevalence. + +```{r c-g1} +#| column: body-outset +``` + +The following box plot displays the distribution of estimated cases by county from 2006 to 2014. For all years of current data the mean of rural counties is higher than that of their Urban counterparts. + + +```{r county-boxplot} +``` + +# By County - Percent Change + +The following graphs display the overall change in estimated prevalence between 2006 and 2014. + +```{r c-g4} +``` + +The following chart displays the density curve of the percentage change for both rural and urban counties. It is notable that the mean of change for Urban counties is actually higher than the mean for rural counties. However, we also see that most change for both regions is positive growth. In fact, only 16 rural and 10 urban counties experienced negative change in the given time frame, while 35 rural and 34 urban counties experienced growth in the same period. 
+ +```{r pct_p-histogram} +``` + + +# Conclusion and Next Steps + +The original hypothesis of this report was that rural counties were growing at a higher rate than their urban counterparts. Throughout this post it has been shown that this hypothesis is incorrect: just being a rural county does not indicate diabetes growth; in fact, the growth rate throughout North Carolina has been consistent. Further posts will explore other reasons for these trends. As the current post merely explores the trends and differences using data visualizations, a more rigorous and formal evaluation of these comparisons is in order. + + +# Session information + +=========================================================================== + +For the sake of documentation and reproducibility, the current report was rendered in the following environment. Click the line below to expand. + +
+ Environment +```{r session-info} +if (!requireNamespace("devtools", quietly = TRUE)) { + sessionInfo() +} else { + devtools::session_info() +} +``` + +
+ +# References + + diff --git a/posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.qmd b/posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.qmd index 07a8ba6..c453a84 100644 --- a/posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.qmd +++ b/posts/2021-01-12_blogdown-to-distill/creating-a-distill-blog.qmd @@ -3,6 +3,7 @@ title: "Converting From Blogdown to Distill" subtitle: | A meta post on transferring from a blogdown to distill blog site date: 01-12-2021 +date-modified: 10-12-2023 categories: - Distill --- diff --git a/posts/2022-07-28_making-maps-in-R/making-maps-in-r.R b/posts/2022-07-28_making-maps-in-R/making-maps-in-r.R new file mode 100644 index 0000000..794f0a6 --- /dev/null +++ b/posts/2022-07-28_making-maps-in-R/making-maps-in-r.R @@ -0,0 +1,96 @@ +#These first few lines run only when the file is run in RStudio, !!NOT when an Rmd/Rnw file calls it!! +rm(list=ls(all=TRUE)) #Clear the variables from previous runs. +cat("\f") # clear console + +# ---- load-packages -------------------------------------------------- +# Attach these packages so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path +library(tidyverse) +library(ggmap) +library(highcharter) + + +# ---- load-sources --------------------------------------------------- + + + +# ---- declare-globals ---------------------------------------------------- + + + +# ---- load-data ------------------------------------------------------ + +# load data from Tidy Tuesday 2022-03-01 (2022, week 9) +# https://github.com/rfordatascience/tidytuesday/blob/master/data/2022/2022-03-01/readme.md + +ev_stations <- tidytuesdayR::tt_load(2022, week = 9) %>% .$stations # keep only the `stations` element of the loaded data + +ds1 <- ev_stations %>% # NC stations only; coordinates and station name renamed to lat, lon, name + filter(STATE == "NC") %>% + select(LATITUDE, LONGITUDE, STATION_NAME, FUEL_TYPE_CODE, STATE, CITY) %>% + rename(lat = LATITUDE, lon = LONGITUDE, name = STATION_NAME) + + +# ggmap ------------------------------------------------------------------- + +# Using GGMAP to create 
map of NC EV Staions + +nc_boundries <- c(-85, 33.5, -75, 37) + +get_stamenmap(bbox = nc_boundries, zoom = 7, map = "toner") %>% ggmap() + + geom_point(data = ds1, aes(x = lon, y = lat, fill = FUEL_TYPE_CODE), shape = 21, size = 3) + + scale_fill_brewer(palette = "Dark2") + + theme_void() + + theme(legend.position = "bottom" + ,plot.title = element_text(hjust = 0.5, margin=margin(0,0,30,0)) #center title and add space due to theme + ) + + guides( #use to move legend title to the top, default is to the side + fill = guide_legend(title.position = "top", title.hjust = 0.5) + ) + + labs( + title = "Alternate Fuel Stations in North Carolina" + ,fill = "Station Type" + ) + + + + + + + + +# create map -------------------------------------------------------------- + +hcmap("countries/us/us-nc-all", showInLegend = FALSE) # NOTE(review): result is not assigned; presumably a quick preview of the bare NC map before the styled hc1 below (confirm it is still wanted) + +colors <- c('#8c510a','#d8b365','#f6e8c3','#c7eae5','#5ab4ac','#01665e') # six-color palette: passed to colorize() for FUEL_TYPE_CODE and reused as the series colors in hc1 + + + +ds1 <- ev_stations %>% # overwrites the ds1 used for the ggmap plot: no STATE/CITY, plus a per-station color column + filter(STATE == "NC") %>% + select(LATITUDE, LONGITUDE, STATION_NAME, FUEL_TYPE_CODE) %>% + rename(lat = LATITUDE, lon = LONGITUDE, name = STATION_NAME) %>% + mutate(color = colorize(FUEL_TYPE_CODE, colors)) + + + +hc1 <- hcmap("countries/us/us-nc-all", showInLegend = FALSE + ,nullColor = "black" #use to set map background color + ,borderColor = "pink" #change the colors of the borders! + ) %>% + hc_add_series( + data = ds1 + ,type = "mappoint" + ,dataLabels = list(enabled = FALSE) + ,tooltip = list(pointFormat = "{point.name}") + ,marker = list(lineWidth = 0, radius = 3, symbol = 'circle') + ,hcaes(color = color + ,group = FUEL_TYPE_CODE + ) #use this to add the color and group to each point + ,color = colors # use this to set the color of the series in the legend + ) %>% + hc_title(text = "Alternate Fuel Stations in North Carolina") +#note no legend title. 
No default way to center legend title and I don't like the look of it + + +hc1 diff --git a/posts/2022-07-28_making-maps-in-R/making-maps-in-r.qmd b/posts/2022-07-28_making-maps-in-R/making-maps-in-r.qmd new file mode 100644 index 0000000..cc7acad --- /dev/null +++ b/posts/2022-07-28_making-maps-in-R/making-maps-in-r.qmd @@ -0,0 +1,18 @@ +--- +title: "Making Maps in R" +subtitle: | + A short description of the post. +date: 2022-07-28 +draft: true +--- + + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = FALSE) +``` + +Distill is a publication format for scientific and technical writing, native to the web. + +Learn more about using Distill at . + +