From a3d95f8414857cb0da845f3acf671c89bdae0cfc Mon Sep 17 00:00:00 2001 From: Kyle Belanger Date: Fri, 11 Oct 2024 08:56:17 -0400 Subject: [PATCH] update taste test --- .../2024-09-27_coffee_taste_test/taste_test.R | 55 ++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/posts/2024-09-27_coffee_taste_test/taste_test.R b/posts/2024-09-27_coffee_taste_test/taste_test.R index f7f4477..f8d85a6 100644 --- a/posts/2024-09-27_coffee_taste_test/taste_test.R +++ b/posts/2024-09-27_coffee_taste_test/taste_test.R @@ -1,24 +1,24 @@ # ---- Clean Environment ---- rm(list = ls()) -# Load Packages ---- +# ---- Load Packages ---- box::use( readr[read_csv], + rlang[`!!`, sym] ) - -# Load Data ---- +# ---- Load Data ---- url <- "https://bit.ly/gacttCSV" ds_raw <- read_csv(url) -# Count NAs ---- +# ---- Count NAs ---- nrow(ds_raw) na_count <- ds_raw |> - dplyr::mutate(num_na = rowSums(is.na(ds))) |> + dplyr::mutate(num_na = rowSums(is.na(ds_raw))) |> dplyr::summarise( n = dplyr::n(), .by = num_na @@ -31,8 +31,9 @@ ds_raw |> dplyr::arrange(desc(value)) |> print(n = 50) -# Clean Data ---- - +# ---- Clean Data ---- +#' Remove columns with a lot of NA's, selected only columns to allow for a pretty complete data set +#' At the end of selection, drop NA's so that all rows are fully complete.
This will avoid having to infer any data for ML ds <- ds_raw |> dplyr::select( @@ -56,5 +57,45 @@ ds <- ds_raw |> ) |> tidyr::drop_na() +skimr::skim(ds) + +# ---- Split Multi choice Columns ---- + +#' Function to split columns that contain multiple choice answers all in one column +#' result of function is each column contains original column name, followed by the answer choice +#' 1 is True, 0 False for the original column containing that choice + +multi_choice_split <- function(ds, col) { + col <- sym(col) + + ds |> + dplyr::select(ID, !!col) |> + tidyr::separate_longer_delim(!!col, delim = ",") |> + dplyr::mutate(dplyr::across(!!col, ~stringr::str_trim(., side = "both"))) |> + dplyr::mutate(dplyr::across(!!col, snakecase::to_snake_case)) |> + dplyr::group_by(ID, !!col) |> + dplyr::summarise(n = dplyr::n()) |> + dplyr::ungroup() |> + tidyr::pivot_wider(names_from = !!col, values_from = n, names_prefix = glue::glue("{col}_")) |> + {\(.) {replace(.,is.na(.),0)}}() #anonymous function to use native pipe + +} + +#' purr map, maps over each column choice returning a data frame for that col, map returns a list of data frames +#' the use of purrr reduce takes the list, and left joins each data frame by the ID col. + +cols_to_split <- c("where_drink", "brew_method", "additions", "why_drink") + +ds_ml <- purrr::map(cols_to_split, \(x) multi_choice_split(ds, x)) |> + purrr::reduce(dplyr::left_join, by = "ID") |> + dplyr::left_join(ds, by = "ID") |> + dplyr::select(-dplyr::any_of(cols_to_split)) |> + dplyr::mutate(dplyr::across(dplyr::where(is.character) & !ID, as.factor)) # note use of ! to not select the ID column +# ---- Data Summaries ---- +#need to expand this section to better explore cleaned data +summarytools::freq(ds_ml) + + +# ----