updates chapter 4

2023-06-07 13:33:09 -04:00 · 2023-06-07 13:33:09 -04:00 · 55b5c1f09d
commit 55b5c1f09d
parent 65f773674d
4 changed files with 207 additions and 21 deletions
--- a/ML/2-modeling.R
+++ b/ML/2-modeling.R
@ -54,6 +54,8 @@ strata_table <- strata1 %>%
  dplyr::left_join(strata2) %>%
  dplyr::rename(Class = name)
 save(list = c("strata_table"), file = "figures/strata_table.Rda")
 # random forest classification -----------------------------------------------------------
@ -121,13 +123,68 @@ class_test_result_conf_matrix <- ys$conf_mat(
  class_test_results %>%  tune::collect_predictions()
  ,truth = ft4_dia
  ,estimate = .pred_class
-  )
+  ) %>%  autoplot(type = "heatmap")
 # Write the classification confusion-matrix figure to figures/ twice:
 # once as EMF (through the devEMF device) and once as PNG, both 7 x 7 at 300 dpi.
 # No plot object is passed to either call, so they depend on ggsave's default
 # plot -- presumably the most recently created ggplot, assuming gp2 is a
 # module alias for ggplot2 (TODO confirm).
 gp2$ggsave(
  here("figures","conf_matrix_class.emf")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
  ,device = devEMF::emf
 )
 # Same figure, PNG format (device is left for ggsave to choose).
 gp2$ggsave(
  here("figures","conf_matrix_class.png")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
 )
 # Accuracy of the predicted class (.pred_class) against the ft4_dia truth
 # column, computed on the collected predictions of class_test_results.
 # NOTE(review): ys is presumably a module alias for yardstick -- confirm.
 ys$accuracy(class_test_results %>%  tune::collect_predictions() ,truth = ft4_dia, estimate = .pred_class )
 # Variable-importance plot built from the fitted parsnip model inside
 # class_test_results; this commit drops the explicit num_features = 10
 # argument in favour of vip::vip()'s own default.
 class_test_results %>%
  workflows::extract_fit_parsnip() %>%
-  vip::vip(num_features = 10)
+  vip::vip()
 # Save the importance plot as EMF and PNG (7 x 7, 300 dpi). No plot object is
 # passed, so both calls depend on ggsave's default plot (TODO confirm this is
 # the plot created just above).
 gp2$ggsave(
  here("figures","vip_class.emf")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
  ,device = devEMF::emf
 )
 gp2$ggsave(
  here("figures","vip_class.png")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
 )
 # Importance plot again, this time from the vip::vi() importance table with
 # the TSH row filtered out before plotting.
 # `!Variable == "TSH"` parses as `!(Variable == "TSH")`, i.e. keep non-TSH rows.
 # NOTE(review): no ggsave call follows before the next plot is created, so
 # this figure is not written to disk in the visible code -- confirm intended.
 class_test_results %>%
  workflows::extract_fit_parsnip() %>%
  vip::vi() %>%
  dplyr::filter(!Variable == "TSH") %>%
  vip::vip()
 # Collect the test-set predictions once and reuse them for the AUC and ROC curve.
 class_result_pred_ds <- class_test_results %>% tune::collect_predictions()
 # ROC AUC with ft4_dia as truth and the four class-probability columns as
 # estimates; the result is not assigned, so it is only printed when run.
 ys$roc_auc(class_result_pred_ds, ft4_dia,.pred_Hypo , `.pred_Non-Hypo`, .pred_Hyper, `.pred_Non-Hyper`)
 # ROC curve data for the same truth/probability columns, converted to a plot
 # with p$autoplot() and kept in roc_curve_class.
 # NOTE(review): p is a module alias defined outside this view -- confirm which
 # package provides autoplot here.
 roc_curve_class <-  ys$roc_curve(class_result_pred_ds, ft4_dia,.pred_Hypo , `.pred_Non-Hypo`, .pred_Hyper, `.pred_Non-Hyper`) %>%
  p$autoplot()
 # Save the ROC figure as EMF and PNG (7 x 7, 300 dpi). roc_curve_class is not
 # passed as the plot argument, so both calls depend on ggsave's default plot
 # (TODO confirm it resolves to the ROC plot created above).
 gp2$ggsave(
  here("figures","roc_curve_class.emf")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
  ,device = devEMF::emf
 )
 gp2$ggsave(
  here("figures","roc_curve_class.png")
  ,width  = 7
  ,height = 7
  ,dpi    = 300
 )
 # x-boost- class ----------------------------------------------------------
--- a/chapter3.qmd
+++ b/chapter3.qmd
@ -2,11 +2,22 @@
 ## IRB
-Based on the information you submitted for this project, the Campbell University Institutional Review Board (Campbell IRB) determined this submission is Not Human Subjects Research as defined by 45 CFR 46.102(e).
+Based on the information you submitted for this project, the Campbell
 University Institutional Review Board (Campbell IRB) determined this
 submission is Not Human Subjects Research as defined by 45 CFR
 46.102(e).
 ## Population and Data
-This study used the Medical Information Mart for Intensive Care (MIMIC) database [@johnsonalistair]. MIMIC (Medical Information Mart for Intensive Care) is an extensive, freely-available database comprising de-identified health-related data from patients who were admitted to the critical care units of the Beth Israel Deaconess Medical Center. The database contains many different types of information, but only data from the patients and laboratory events table are used in this study. The study uses version IV of the database, comprising data from 2008 - 2019.
+This study used the Medical Information Mart for Intensive Care (MIMIC)
 database [@johnsonalistair]. MIMIC (Medical Information Mart for
 Intensive Care) is an extensive, freely-available database comprising
 de-identified health-related data from patients who were admitted to the
 critical care units of the Beth Israel Deaconess Medical Center. The
 database contains many different types of information, but only data
 from the patients and laboratory events table are used in this study.
 The study uses version IV of the database, comprising data from 2008 -
 2019.
 ## Data Variables and Outcomes
@ -17,19 +28,37 @@ source(here::here("ML","1-data-exploration.R"))
 ```
-A total of 18 variables were chosen for this study. The age and gender of the patient were pulled from the patient table in the MIMIC database. While this database contains some additional demographic information, it is incomplete and thus unusable for this study. 15 lab values were selected for this study, this includes:
+A total of 18 variables were chosen for this study. The age and gender
 of the patient were pulled from the patient table in the MIMIC database.
 While this database contains some additional demographic information, it
 is incomplete and thus unusable for this study. Fifteen lab values were
 selected for this study, including:
-   **BMP**: BUN, bicarbonate, calcium, chloride, creatinine, glucose, potassium, sodium
+-   **BMP**: BUN, bicarbonate, calcium, chloride, creatinine, glucose,
    potassium, sodium
-   **CBC**: Hematocrit, hemoglobin, platelet count, red blood cell count, white blood cell count
+-   **CBC**: Hematocrit, hemoglobin, platelet count, red blood cell
    count, white blood cell count
 -   TSH
 -   Free T4
-The unique patient id and chart time were also retained for identifying each sample. Each sample contains one set of 15 lab values for each patient. Patients may have several samples in the data set run at different times. Rows were retained as long as they had less than three missing results. These missing results can be filled in by imputation later in the process. Samples were also filtered for those with TSH above or below the reference range of 0.27 - 4.2 uIU/mL. These represent samples that would have reflexed for Free T4 testing. After filtering, the final data set contained `r nrow(ds1)` rows.
+The unique patient id and chart time were also retained for identifying
 each sample. Each sample contains one set of 15 lab values for each
 patient. Patients may have several samples in the data set run at
 different times. Rows were retained as long as they had fewer than three
 missing results. These missing results can be filled in by imputation
 later in the process. Samples were also filtered for those with TSH
 above or below the reference range of 0.27 - 4.2 uIU/mL. These represent
 samples that would have reflexed for Free T4 testing. After filtering,
 the final data set contained `r nrow(ds1)` rows.
-Once the final data set was collected, an additional column was created for the outcome variable to determine if the Free T4 value was diagnostic. This outcome variable was used for building classification models. The classification variable was not used in regression models. @tbl-outcome_var shows how the outcomes were added
+Once the final data set was collected, an additional column was created
 for the outcome variable to determine if the Free T4 value was
 diagnostic. This outcome variable was used for building classification
 models. The classification variable was not used in regression models.
@tbl-outcome_var shows how the outcomes were added.
 | TSH Value     | Free T4 Value | Outcome             |
 |---------------|---------------|---------------------|
@ -40,7 +69,14 @@ Once the final data set was collected, an additional column was created for the
 : Outcome Variable {#tbl-outcome_var}
-. @tbl-data_summary shows the summary statistics of each variable selected for the study. Each numeric variable is listed with the percent missing, median, and interquartile range (IQR). The data set is weighted toward elevated TSH levels, with 80% of values falling into that category. Glucose and Calcium have several missing values at `r gtsummary::inline_text(summary_tbl, variable = GLU, column = n)` and `r gtsummary::inline_text(summary_tbl, variable = CA, column = n)`, respectively.
+@tbl-data_summary shows the summary statistics of each variable
 selected for the study. Each numeric variable is listed with the percent
 missing, median, and interquartile range (IQR). The data set is weighted
 toward elevated TSH levels, with 80% of values falling into that
 category. Glucose and Calcium have several missing values at
 `r gtsummary::inline_text(summary_tbl, variable = GLU, column = n)` and
 `r gtsummary::inline_text(summary_tbl, variable = CA, column = n)`,
 respectively.
 ```{r}
 #| label: tbl-data_summary
@ -52,19 +88,41 @@ summary_tbl %>% gtsummary$as_kable()
 ## Data Inspection
-By examining @tbl-data_summary several important data set characteristics quickly come to light without explanation. The median age across the data set, as a whole, is quite similar, with a median age across all categories of 62.5. Females are better represented in the data set, with higher percentages in all categories. Across all categories, the median values for each lab result are pretty similar. The expectation for this is Red Blood cells, which show more considerable variation across the various categories.
+By examining @tbl-data_summary several important data set
 characteristics quickly come to light without explanation. The median
 age across the data set, as a whole, is quite similar, with a median age
 across all categories of 62.5. Females are better represented in the
 data set, with higher percentages in all categories. Across all
 categories, the median values for each lab result are pretty similar.
 The exception to this is red blood cells, which show more
 considerable variation across the various categories.
-![Distribution of Variables](figures/distrubution_histo){#fig-distro_histo}
+![Distribution of
 Variables](figures/distrubution_histo){#fig-distro_histo}
-When examining @fig-distro_histo, many clinical chemistry values do not show a standard distribution. However, the hematology results typically do appear to follow a standard distribution. While not a problem for most tree-based classification models, many regression models perform better with standard variables. Standardizing variables provides a common comparable unit of measure across all the variables [@boehmke2020]. Since lab values do not contain negative numbers, all numeric values will be log-transformed to bring them to normal distributions.
+When examining @fig-distro_histo, many clinical chemistry values do not
 show a normal distribution. However, the hematology results typically
 do appear to follow a normal distribution. While not a problem for
 most tree-based classification models, many regression models perform
 better with standard variables. Standardizing variables provides a
 common comparable unit of measure across all the variables
 [@boehmke2020]. Since lab values do not contain negative numbers, all
 numeric values will be log-transformed to bring them to normal
 distributions.
 ![Variable Correlation Plot](figures/corr_plot){#fig-corr_plot}
-@fig-corr_plot shows a high correlation between Hemoglobin, hematocrit, and Red Blood Cell values (as expected). While high correlation does not lead to model issues, it can cause unnecessary computations with little value. However, due to the small number of variables, the computation burden is not expected to cause delays, and thus the variables will not be removed.
+@fig-corr_plot shows a high correlation between Hemoglobin, hematocrit,
 and Red Blood Cell values (as expected). While high correlation does not
 lead to model issues, it can cause unnecessary computations with little
 value. However, due to the small number of variables, the computation
 burden is not expected to cause delays, and thus the variables will not
 be removed.
 ## Data Tools
-All data handling and modeling were performed using R and R Studio. The current report was rendered in the following environment.
+All data handling and modeling were performed using R and R Studio. The
 current report was rendered in the following environment.
 ```{r}
 #| label: tbl-platform-info
@ -118,7 +176,14 @@ knitr::kable(
 ## Model Selection
-Both classification and regression models were screened using a random grid search to tune hyperparameters. The models were tested against the training data set to find the best-fit model. @fig-reg-screen shows the results of the model screening for regression models, using root mean square error (RMSE) as the ranking method. Random Forest models and boosted trees performed similarly and were selected for further testing. A full grid search was performed on both models, with a Random Forest model as the final selection. The final hyperparameters selected were:
+Both classification and regression models were screened using a random
 grid search to tune hyperparameters. The models were tested against the
 training data set to find the best-fit model. @fig-reg-screen shows the
 results of the model screening for regression models, using root mean
 square error (RMSE) as the ranking method. Random Forest models and
 boosted trees performed similarly and were selected for further testing.
 A full grid search was performed on both models, with a Random Forest
 model as the final selection. The final hyperparameters selected were:
 -   mtry: 8
@ -128,7 +193,12 @@ Both classification and regression models were screened using a random grid sear
 ![Regression Model Screen](figures/reg_screen){#fig-reg-screen}
-@fig-class-screen shows the results of the model screen for classification models using accuracy as the ranking method. As with regression models, boosted trees and random forest models performed the best. After completing a full grid search of both model types, a random forest model was again chosen as the final model. The final hyperparameters for the model selected were:
+@fig-class-screen shows the results of the model screen for
 classification models using accuracy as the ranking method. As with
 regression models, boosted trees and random forest models performed the
 best. After completing a full grid search of both model types, a random
 forest model was again chosen as the final model. The final
 hyperparameters for the model selected were:
 -   mtry: 8
--- a/chapter4.qmd
+++ b/chapter4.qmd
@ -2,10 +2,9 @@
 ```{r}
 #| include: false
 #| cache: true
 library(magrittr)
-load("test.Rda")
+# Load strata_table from figures/strata_table.Rda (written by ML/2-modeling.R).
+# `here::here` is the namespace-qualified call; the previous `here:here` used
+# the sequence operator `:` on an object named `here`, which fails because only
+# magrittr is attached in this chunk.
+load(here::here("figures", "strata_table.Rda"))
 ```
@ -29,7 +28,7 @@ strata_table %>% knitr::kable()
 First, the report shows the ability of classification algorithms to
 predict whether Free T4 will be diagnostic, with the prediction quality
 measured by Area Under Curve (AUC) and accuracy. Data regarding the
-univariate association between each predictor analyte and the Free T4
+importance of each predictor analyte in predicting the Free T4
 Diagnostic value is then presented. Finally, data is presented with the
 extent to which FT4 can be predicted by examining the correlation
 statistics denoting the relationship between measured and predicted Free
@ -42,4 +41,35 @@ numerical laboratory results is often just whether the results fall
 within the normal reference range [@luo2016]. In the case of Free T4
 reflex testing, the results will either fall within the normal range
 indicating the Free T4 is not diagnostic of hyper- or hypothyroidism, or
-they will fall outside those ranges indicating they are diagnostic.
+they will fall outside those ranges indicating they are diagnostic. The
 final model achieved an accuracy of 0.796 and an AUC of 0.918.
@fig-roc_curve provides ROC curves for each of the four outcome classes.
 ![ROC curves for each of the four outcome
 classes](figures/roc_curve_class){#fig-roc_curve}
@fig-conf-matrix-class shows the confusion matrix of the final testing
 data. Of the 2269 total results, 1805 were predicted correctly, leaving
 464 incorrectly predicted results. Of the incorrectly predicted results,
 72 results predicted a diagnostic Free T4 when the correct result was
 non-diagnostic. 392 of the incorrectly predicted results were predicted
 as non-diagnostic when the correct result was diagnostic.
 ![Final Model Confusion
 Matrix](figures/conf_matrix_class){#fig-conf-matrix-class}
 ## Contributions of Individual Analytes
 Understanding how an ML model makes predictions helps build trust in the
 model and is the fundamental idea of the emerging field of interpretable
 machine learning (IML) [@greenwell2020]. @fig-vip-class shows the
 importance of features in the final model. Importance can be defined as
 the extent to which a feature has a \"meaningful\" impact on the
 predicted outcome [@laan2006]. As expected, TSH is the leading variable
 in importance rankings, leading all other variables by over 2000
 points. The following three variables are all parts of a Complete Blood
 Count (CBC), followed by the patient's glucose value.
 ![Variable Importance Plot](figures/vip_class){#fig-vip-class}
 ## Predictability of Free T4 Results (Regression) 
--- a/references.bib
+++ b/references.bib
@ -350,3 +350,32 @@ DOI: 10.13026/S6N6-XD98}
 	note = {PMID: 27329638},
 	langid = {eng}
 }
@article{greenwell2020,
 	title = {Variable Importance Plots{\textemdash}An Introduction to the vip Package},
 	author = {Greenwell, Brandon M. and Boehmke, Bradley C.},
 	year = {2020},
 	date = {2020},
 	journal = {The R Journal},
 	pages = {343},
 	volume = {12},
 	number = {1},
 	doi = {10.32614/RJ-2020-013},
 	url = {https://journal.r-project.org/archive/2020/RJ-2020-013/index.html},
 	langid = {en}
 }
@article{laan2006,
 	title = {Statistical Inference for Variable Importance},
 	author = {van der Laan, Mark J.},
 	year = {2006},
 	month = {02},
 	date = {2006-02-20},
 	journal = {The International Journal of Biostatistics},
 	volume = {2},
 	number = {1},
 	doi = {10.2202/1557-4679.1008},
 	url = {https://www.degruyter.com/document/doi/10.2202/1557-4679.1008/html},
 	note = {Publisher: De Gruyter},
 	langid = {en}
 }