#load required packages
library(tidyverse)
library(tidymodels)
library(here)
library(rpart)
library(glmnet)
library(ranger)
library(rpart.plot)
library(parallel)
library(vip)
Flu Analysis
Machine Learning
This file contains data processing and analysis, including some implementation of machine learning. It is conducted on the dataset from McKay et al. (2020); the hyperlink to the dataset ("found here") was lost in conversion — see the original source document for the data location.
Load and Process Data
# Load the cleaned flu dataset produced by the earlier processing script
flu_data <- readr::read_rds(here("fluanalysis", "data", "flu_data_clean.RDS"))
glimpse(flu_data)
Rows: 730
Columns: 32
$ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y~
$ ChestCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
$ ChillsSweats <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, ~
$ NasalCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
$ CoughYN <fct> Yes, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, No, ~
$ Sneeze <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, ~
$ Fatigue <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ SubjectiveFever <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes~
$ Headache <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes~
$ Weakness <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi~
$ WeaknessYN <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ CoughIntensity <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve~
$ CoughYN2 <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes~
$ Myalgia <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, ~
$ MyalgiaYN <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ RunnyNose <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No~
$ AbPain <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N~
$ ChestPain <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, ~
$ Diarrhea <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,~
$ EyePn <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye~
$ Insomnia <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y~
$ ItchyEye <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,~
$ Nausea <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y~
$ EarPn <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y~
$ Hearing <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No,~
$ Pharyngitis <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, ~
$ Breathless <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, ~
$ ToothPn <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N~
$ Vision <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, ~
$ Vomit <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N~
$ Wheeze <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N~
$ BodyTemp <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, ~
I now want to remove a few correlated variables.
# Remove yes/no variables (they duplicate information in the corresponding
# severity-level variables, so they are highly correlated with them)
flu_data <- flu_data %>%
  select(-ends_with("YN"))

# Remove near-zero variance predictors (Hearing and Vision are almost all "No")
flu_data <- flu_data %>%
  select(-c(Hearing, Vision))

# View the reduced data
glimpse(flu_data)
Rows: 730
Columns: 27
$ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y~
$ ChestCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
$ ChillsSweats <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, ~
$ NasalCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
$ Sneeze <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, ~
$ Fatigue <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ SubjectiveFever <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes~
$ Headache <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes~
$ Weakness <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi~
$ CoughIntensity <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve~
$ CoughYN2 <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes~
$ Myalgia <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, ~
$ RunnyNose <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No~
$ AbPain <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N~
$ ChestPain <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, ~
$ Diarrhea <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,~
$ EyePn <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye~
$ Insomnia <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y~
$ ItchyEye <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,~
$ Nausea <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y~
$ EarPn <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y~
$ Pharyngitis <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, ~
$ Breathless <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, ~
$ ToothPn <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N~
$ Vomit <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N~
$ Wheeze <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N~
$ BodyTemp <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, ~
Data preparation
# Set seed for reproducible analysis
set.seed(123)

# Split data into 70% training / 30% testing, stratified by the outcome BodyTemp
flu_split <- initial_split(flu_data, prop = 7/10, strata = BodyTemp)

# Training and testing data frames
flu_train <- training(flu_split)
flu_test <- testing(flu_split)
Cross validation
# 5-fold cross-validation, repeated 5 times, stratified by BodyTemp
folds <- vfold_cv(flu_train, v = 5, repeats = 5, strata = BodyTemp)
Recipe
# Recipe using all predictors; dummy-encode every nominal predictor
global_recipe <-
  recipe(BodyTemp ~ ., data = flu_train) %>%
  step_dummy(all_nominal(), -all_outcomes())

# Null (intercept-only) recipe, used as a baseline for model comparison
null_recipe <-
  recipe(BodyTemp ~ 1, data = flu_train)
# Linear regression model spec.
# NOTE: named lm_model (not linear_reg) to avoid shadowing parsnip's
# linear_reg() function.
lm_model <- linear_reg() %>%
  set_engine("lm") %>%
  set_mode("regression")

# Null-model workflow: linear model on the intercept-only recipe
null_workflow <- workflow() %>%
  add_model(lm_model) %>%
  add_recipe(null_recipe)

# Fit the null model across the cross-validation folds
null_model <- fit_resamples(null_workflow, resamples = folds)

# Null model performance (baseline RMSE to beat)
null_performance <- collect_metrics(null_model)
tibble(null_performance)
# A tibble: 2 x 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 1.21 25 0.0177 Preprocessor1_Model1
2 rsq standard NaN 0 NA Preprocessor1_Model1
Tree Model
# Decision tree model spec with tunable cost complexity and depth
tree_model <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune()
) %>%
  set_engine("rpart") %>%
  set_mode("regression")

# Tree workflow: tree model + full-predictor recipe
tree_workflow <- workflow() %>%
  add_model(tree_model) %>%
  add_recipe(global_recipe)

# Regular tuning grid: 5 levels per parameter (25 candidates total)
tree_grid <- grid_regular(cost_complexity(),
                          tree_depth(),
                          levels = 5)

# Tune the tree across the cross-validation folds
tree_fit <- tune_grid(tree_workflow, resamples = folds, grid = tree_grid)

# Tree model performance
tree_performance <- collect_metrics(tree_fit)
tibble(tree_performance)
# A tibble: 50 x 8
cost_complexity tree_depth .metric .estimator mean n std_err .config
<dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 1 rmse standard 1.19 25 0.0181 Prepro~
2 0.0000000001 1 rsq standard 0.0361 25 0.00422 Prepro~
3 0.0000000178 1 rmse standard 1.19 25 0.0181 Prepro~
4 0.0000000178 1 rsq standard 0.0361 25 0.00422 Prepro~
5 0.00000316 1 rmse standard 1.19 25 0.0181 Prepro~
6 0.00000316 1 rsq standard 0.0361 25 0.00422 Prepro~
7 0.000562 1 rmse standard 1.19 25 0.0181 Prepro~
8 0.000562 1 rsq standard 0.0361 25 0.00422 Prepro~
9 0.1 1 rmse standard 1.21 25 0.0177 Prepro~
10 0.1 1 rsq standard NaN 0 NA Prepro~
# i 40 more rows
# Visualize tuning results
autoplot(tree_fit)

# Select the best tree hyperparameters by RMSE
best_tree <- tree_fit %>%
  select_best(metric = "rmse")

# Finalize the workflow with the selected hyperparameters
final_tree <- tree_workflow %>%
  finalize_workflow(best_tree)

# Fit the final tree to the training data
tree_fit_final <- final_tree %>%
  fit(flu_train)

# Plot the final fitted tree
rpart.plot::rpart.plot(extract_fit_parsnip(tree_fit_final)$fit)

# Tree residual plot.
# Predict from the already-fitted workflow (the original refit the model
# a second time before predicting, which was redundant).
tree_pred <- tree_fit_final %>%
  predict(flu_train)
tree_resid <- flu_train$BodyTemp - tree_pred$.pred
tree_df <- tibble(tree_resid, tree_pred)

ggplot(aes(.pred, tree_resid), data = tree_df) +
  geom_point() +
  labs(x = "Predictions", y = "Residuals") +
  geom_hline(yintercept = 0) +
  theme_bw()
LASSO
# LASSO model: tune the penalty; mixture = 1 gives a pure lasso (L1) penalty
lasso_model <- linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("regression")

# LASSO workflow
lasso_workflow <- workflow() %>%
  add_model(lasso_model) %>%
  add_recipe(global_recipe)

# Penalty grid: 30 values log-spaced between 1e-4 and 1e-1
lasso_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
# Five highest penalty values in the grid
# (slice_max replaces the superseded top_n and names the column explicitly)
lasso_grid %>%
  slice_max(penalty, n = 5)
Selecting by penalty
# A tibble: 5 x 1
penalty
<dbl>
1 0.0386
2 0.0489
3 0.0621
4 0.0788
5 0.1
# Five lowest penalty values in the grid
# (slice_min replaces the superseded top_n(-5) and names the column explicitly)
lasso_grid %>%
  slice_min(penalty, n = 5)
Selecting by penalty
# A tibble: 5 x 1
penalty
<dbl>
1 0.0001
2 0.000127
3 0.000161
4 0.000204
5 0.000259
# Tune the penalty across the cross-validation folds, keeping predictions
lasso_train <- lasso_workflow %>%
  tune_grid(
    resamples = folds,
    grid = lasso_grid,
    control = control_grid(save_pred = TRUE),
    metrics = metric_set(rmse))

# LASSO performance across the grid
lasso_train %>%
  collect_metrics()
# A tibble: 30 x 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0001 rmse standard 1.18 25 0.0167 Preprocessor1_Model01
2 0.000127 rmse standard 1.18 25 0.0167 Preprocessor1_Model02
3 0.000161 rmse standard 1.18 25 0.0167 Preprocessor1_Model03
4 0.000204 rmse standard 1.18 25 0.0167 Preprocessor1_Model04
5 0.000259 rmse standard 1.18 25 0.0167 Preprocessor1_Model05
6 0.000329 rmse standard 1.18 25 0.0167 Preprocessor1_Model06
7 0.000418 rmse standard 1.18 25 0.0167 Preprocessor1_Model07
8 0.000530 rmse standard 1.18 25 0.0167 Preprocessor1_Model08
9 0.000672 rmse standard 1.18 25 0.0167 Preprocessor1_Model09
10 0.000853 rmse standard 1.18 25 0.0167 Preprocessor1_Model10
# i 20 more rows
# Visualize tuning results
autoplot(lasso_train)

# Select the best penalty by RMSE
lasso_best <- lasso_train %>%
  select_best(metric = "rmse")

# Finalized LASSO workflow
lasso_final <- lasso_workflow %>%
  finalize_workflow(lasso_best)

# Fit once to the training data and reuse that fit for prediction
# (the original refit the workflow a second time, which was redundant)
lasso_fit <- lasso_final %>%
  fit(flu_train)
lasso_pred <- lasso_fit %>%
  predict(flu_train)

# Coefficient path plot against log-lambda from the underlying glmnet fit
plot_fit <- extract_fit_engine(lasso_fit)
plot(plot_fit, "lambda")

# LASSO residual plot
lasso_resid <- flu_train$BodyTemp - lasso_pred$.pred
lasso_df <- tibble(lasso_resid, lasso_pred)

ggplot(aes(.pred, lasso_resid), data = lasso_df) +
  geom_point() +
  labs(x = "Predictions", y = "Residuals") +
  geom_hline(yintercept = 0) +
  theme_bw()
Random Forest
# Number of available cores, used to parallelize ranger's tree fitting
cores <- parallel::detectCores()

# Random forest model: tune mtry and min_n; 1000 trees
rf_model <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_engine("ranger", num.threads = cores) %>%
  set_mode("regression")

# Random forest workflow
rf_workflow <- workflow() %>%
  add_model(rf_model) %>%
  add_recipe(global_recipe)

# Tune over an automatically generated 25-candidate grid
rf_train <- rf_workflow %>%
  tune_grid(
    resamples = folds,
    grid = 25,
    control = control_grid(save_pred = TRUE),
    metrics = metric_set(rmse))
i Creating pre-processing data to finalize unknown parameter: mtry
# Visualize tuning results and show metrics for every candidate
autoplot(rf_train)
rf_train %>% collect_metrics()
# A tibble: 25 x 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 30 20 rmse standard 1.19 25 0.0168 Preprocessor1_Model01
2 12 32 rmse standard 1.17 25 0.0166 Preprocessor1_Model02
3 26 11 rmse standard 1.21 25 0.0165 Preprocessor1_Model03
4 28 6 rmse standard 1.21 25 0.0165 Preprocessor1_Model04
5 3 7 rmse standard 1.17 25 0.0166 Preprocessor1_Model05
6 16 37 rmse standard 1.17 25 0.0166 Preprocessor1_Model06
7 4 25 rmse standard 1.16 25 0.0167 Preprocessor1_Model07
8 11 27 rmse standard 1.17 25 0.0168 Preprocessor1_Model08
9 15 35 rmse standard 1.17 25 0.0167 Preprocessor1_Model09
10 23 38 rmse standard 1.18 25 0.0168 Preprocessor1_Model10
# i 15 more rows
# Select the best random forest hyperparameters by RMSE
rf_best <- rf_train %>%
  select_best(metric = "rmse")

# Finalize the workflow and fit once to the training data
# (reusing the fit for prediction avoids the original's redundant refit)
rf_final <- rf_workflow %>%
  finalize_workflow(rf_best)
rf_fit <- rf_final %>%
  fit(flu_train)
rf_pred <- rf_fit %>%
  predict(flu_train)

# Random forest residual plot
rf_resid <- flu_train$BodyTemp - rf_pred$.pred
rf_df <- tibble(rf_resid, rf_pred)

ggplot(aes(.pred, rf_resid), data = rf_df) +
  geom_point() +
  labs(x = "Predictions", y = "Residuals") +
  geom_hline(yintercept = 0) +
  theme_bw()
Compare model performances
# Best (only) null model RMSE: ~1.207
null_rmse <- null_model %>% show_best(metric = "rmse")
null_rmse
# A tibble: 1 x 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 1.21 25 0.0177 Preprocessor1_Model1
# Best tree model RMSE: ~1.189
tree_rmse <- tree_fit %>% show_best(metric = "rmse", n = 1)
tree_rmse
# A tibble: 1 x 8
cost_complexity tree_depth .metric .estimator mean n std_err .config
<dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 1 rmse standard 1.19 25 0.0181 Preprocesso~
# Best LASSO RMSE: ~1.155
lasso_rmse <- lasso_train %>% show_best(metric = "rmse", n = 1)
lasso_rmse
# A tibble: 1 x 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0621 rmse standard 1.16 25 0.0170 Preprocessor1_Model28
# Best random forest RMSE: ~1.163
rf_rmse <- rf_train %>% show_best(metric = "rmse", n = 1)
rf_rmse
# A tibble: 1 x 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 7 31 rmse standard 1.16 25 0.0165 Preprocessor1_Model15
The RMSE values for the best performing models were as follows:
| Null  | Tree  | LASSO | Random Forest |
|-------|-------|-------|---------------|
| 1.207 | 1.189 | 1.155 | 1.163 |
Based on these RMSE values, the LASSO appears to have the best performance. Because of this, this model will be used to fit the test data.
# Fit the finalized LASSO workflow on the training data and evaluate it
# once on the held-out test set
final_model <- lasso_final %>%
  last_fit(split = flu_split, metrics = metric_set(rmse))
final_model %>% collect_metrics()
# A tibble: 1 x 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 1.15 Preprocessor1_Model1
# Collect test-set predictions from the final fit
final_pred <- final_model %>%
  collect_predictions()

# Variable importance plot for the final LASSO model
final_model %>%
  extract_fit_engine() %>%
  vip()

# Residual plot on the test set
final_resid <- final_pred$BodyTemp - final_pred$.pred
final_df <- tibble(final_resid, final_pred)

ggplot(aes(.pred, final_resid), data = final_df) +
  geom_point() +
  labs(x = "Predictions", y = "Residuals") +
  geom_hline(yintercept = 0) +
  theme_bw()