update taste test

This commit is contained in:
Kyle Belanger 2024-10-11 08:56:17 -04:00
parent 84e4777430
commit a3d95f8414

View file

@ -1,24 +1,24 @@
# ---- Clean Environment ----
rm(list = ls())
# Load Packages ----
# ---- Load Packages ----
box::use(
readr[read_csv],
rlang[`!!`, sym]
)
# Load Data ----
# ---- Load Data ----
url <- "https://bit.ly/gacttCSV"
ds_raw <- read_csv(url)
# Count NAs ----
# ---- Count NAs ----
nrow(ds_raw)
na_count <- ds_raw |>
dplyr::mutate(num_na = rowSums(is.na(ds))) |>
dplyr::mutate(num_na = rowSums(is.na(ds_raw))) |>
dplyr::summarise(
n = dplyr::n(),
.by = num_na
@ -31,8 +31,9 @@ ds_raw |>
dplyr::arrange(desc(value)) |>
print(n = 50)
# Clean Data ----
# --- Clean Data ----
#' Remove columns with a lot of NA's, selected only columns to allow for a pretty complete data set
#' At the end of selection, drop NA's so that all rows are fully complete. This will avoid having to infer any data for ML
ds <- ds_raw |>
dplyr::select(
@ -56,5 +57,45 @@ ds <- ds_raw |>
) |>
tidyr::drop_na()
skimr::skim(ds)
# ---- Split Multi choice Columns ----
#' Function to split columns that contain multiple choice answers all in one column
#' result of function is each column contains original column name, followed by the answer choice
#' 1 is True, 0 False for the original column containing that choice
multi_choice_split <- function(ds, col) {
col <- sym(col)
ds |>
dplyr::select(ID, !!col) |>
tidyr::separate_longer_delim(!!col, delim = ",") |>
dplyr::mutate(dplyr::across(!!col, ~stringr::str_trim(., side = "both"))) |>
dplyr::mutate(dplyr::across(!!col, snakecase::to_snake_case)) |>
dplyr::group_by(ID, !!col) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::ungroup() |>
tidyr::pivot_wider(names_from = !!col, values_from = n, names_prefix = glue::glue("{col}_")) |>
{\(.) {replace(.,is.na(.),0)}}() #anonymous function to use native pipe
}
#' purr map, maps over each column choice returning a data frame for that col, map returns a list of data frames
#' the use of purrr reduce takes the list, and left joins each data frame by the ID col.
cols_to_split <- c("where_drink", "brew_method", "additions", "why_drink")
ds_ml <- purrr::map(cols_to_split, \(x) multi_choice_split(ds, x)) |>
purrr::reduce(dplyr::left_join, by = "ID") |>
dplyr::left_join(ds, by = "ID") |>
dplyr::select(-dplyr::any_of(cols_to_split)) |>
dplyr::mutate(dplyr::across(dplyr::where(is.character) & !ID, as.factor)) # note use of ! to not select the ID column
# ---- Data Summaries ----
#need to expand this section to better explore cleaned data
summarytools::freq(ds_ml)
# ----