#load necessary packages
library(tidyverse)
library(tidymodels)
library(here)
library(performance)
library(dplyr)
library(yardstick)
Flu Analysis
Model Evaluation
This is the final file of a four-part data analysis exercise, conducted on the dataset from McKay et al 2020, found here. This file contains model fitting and model performance evaluation.
Load Data/Packages
#load and view data
<- readRDS(here::here("fluanalysis", "data", "flu_data_clean.RDS"))
flu_data glimpse(flu_data)
Rows: 730
Columns: 32
$ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y~
$ ChestCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
$ ChillsSweats <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, ~
$ NasalCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
$ CoughYN <fct> Yes, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, No, ~
$ Sneeze <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, ~
$ Fatigue <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ SubjectiveFever <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes~
$ Headache <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes~
$ Weakness <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi~
$ WeaknessYN <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ CoughIntensity <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve~
$ CoughYN2 <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes~
$ Myalgia <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, ~
$ MyalgiaYN <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ RunnyNose <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No~
$ AbPain <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N~
$ ChestPain <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, ~
$ Diarrhea <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,~
$ EyePn <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye~
$ Insomnia <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y~
$ ItchyEye <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,~
$ Nausea <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y~
$ EarPn <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y~
$ Hearing <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No,~
$ Pharyngitis <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, ~
$ Breathless <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, ~
$ ToothPn <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N~
$ Vision <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, ~
$ Vomit <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N~
$ Wheeze <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N~
$ BodyTemp <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, ~
Multivariate Model
Create training and testing datasets from the original data.
#Fix random numbers by setting seed
set.seed(1999)
# Put 3/4 of the data into the training set
<- initial_split(flu_data, prop = 3/4)
data_split
# Create data frames for the two sets:
<- training(data_split)
train_data <- testing(data_split) test_data
Create recipe and set engine
#create recipe
<- recipe(Nausea ~ ., data = train_data)
flu_rec
#set engine
<-
log_reg logistic_reg() %>%
set_engine("glm")
Create workflow and fit the training data
#create workflow
<-
flu_flow workflow() %>%
add_model(log_reg) %>%
add_recipe(flu_rec)
#fit to training data
<-
flu_fit %>%
flu_flow fit(data = train_data)
Extract model fit
%>%
flu_fit extract_fit_parsnip() %>%
tidy()
# A tibble: 38 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 8.31 9.74 0.853 0.393
2 SwollenLymphNodesYes -0.279 0.226 -1.23 0.218
3 ChestCongestionYes 0.477 0.252 1.90 0.0580
4 ChillsSweatsYes 0.406 0.338 1.20 0.230
5 NasalCongestionYes 0.568 0.309 1.84 0.0657
6 CoughYNYes -0.218 0.575 -0.380 0.704
7 SneezeYes 0.0698 0.248 0.281 0.779
8 FatigueYes 0.394 0.442 0.891 0.373
9 SubjectiveFeverYes 0.430 0.261 1.65 0.0997
10 HeadacheYes 0.441 0.346 1.27 0.203
# i 28 more rows
Fit testing data using model created from training data
predict(flu_fit, test_data)
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
# A tibble: 183 x 1
.pred_class
<fct>
1 No
2 Yes
3 No
4 No
5 No
6 No
7 Yes
8 No
9 No
10 Yes
# i 173 more rows
<-
flu_aug augment(flu_fit, test_data)
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
%>%
flu_aug select(Nausea, .pred_No, .pred_Yes)
# A tibble: 183 x 3
Nausea .pred_No .pred_Yes
<fct> <dbl> <dbl>
1 No 0.711 0.289
2 Yes 0.105 0.895
3 Yes 0.622 0.378
4 Yes 0.675 0.325
5 Yes 0.606 0.394
6 No 0.664 0.336
7 No 0.448 0.552
8 No 0.665 0.335
9 No 0.617 0.383
10 Yes 0.462 0.538
# i 173 more rows
%>%
flu_aug roc_curve(truth = Nausea, .pred_No) %>%
autoplot()
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
i Please use `reframe()` instead.
i When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.
i The deprecated feature was likely used in the yardstick package.
Please report the issue at <https://github.com/tidymodels/yardstick/issues>.
%>%
flu_aug roc_auc(truth = Nausea, .pred_No)
# A tibble: 1 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.723
Univariate Model
Create recipe and set engine
#create recipe
<- recipe(Nausea ~ RunnyNose, data = train_data) flu_rec2
Create workflow and fit the training data
#create workflow
<-
flu_flow2 workflow() %>%
add_model(log_reg) %>%
add_recipe(flu_rec2)
#fit to training data
<-
flu_fit2 %>%
flu_flow2 fit(data = train_data)
Extract model fit
%>%
flu_fit2 extract_fit_parsnip() %>%
tidy()
# A tibble: 2 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -0.582 0.165 -3.52 0.000433
2 RunnyNoseYes -0.104 0.197 -0.525 0.600
Fit testing data using model created from training data
predict(flu_fit2, test_data)
# A tibble: 183 x 1
.pred_class
<fct>
1 No
2 No
3 No
4 No
5 No
6 No
7 No
8 No
9 No
10 No
# i 173 more rows
<-
flu_aug2 augment(flu_fit2, test_data)
%>%
flu_aug2 select(Nausea, .pred_No, .pred_Yes)
# A tibble: 183 x 3
Nausea .pred_No .pred_Yes
<fct> <dbl> <dbl>
1 No 0.642 0.358
2 Yes 0.665 0.335
3 Yes 0.642 0.358
4 Yes 0.665 0.335
5 Yes 0.665 0.335
6 No 0.665 0.335
7 No 0.642 0.358
8 No 0.642 0.358
9 No 0.642 0.358
10 Yes 0.665 0.335
# i 173 more rows
%>%
flu_aug2 roc_curve(truth = Nausea, .pred_No) %>%
autoplot()
%>%
flu_aug2 roc_auc(truth = Nausea, .pred_No)
# A tibble: 1 x 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.449
This section added by Raquel Francisco
Working with continous variable (BodyTemp)
##Will use as much as premade data/code as possible
#create recipe
<- recipe(BodyTemp ~ ., data = train_data)
flu_rec3
#reset engine
<-
lm_reg linear_reg() %>%
set_engine("lm")
#create workflow
<-
flu_flow3 workflow() %>%
add_model(lm_reg) %>%
add_recipe(flu_rec3)
#fit to training data
<-
flu_fit3 %>%
flu_flow3 fit(data = train_data)
%>%
flu_fit3 extract_fit_parsnip() %>%
tidy()
# A tibble: 38 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 98.3 0.343 286. 0
2 SwollenLymphNodesYes -0.135 0.102 -1.33 0.186
3 ChestCongestionYes 0.0318 0.110 0.289 0.772
4 ChillsSweatsYes 0.221 0.142 1.55 0.121
5 NasalCongestionYes -0.276 0.129 -2.14 0.0332
6 CoughYNYes 0.345 0.256 1.35 0.178
7 SneezeYes -0.401 0.111 -3.61 0.000336
8 FatigueYes 0.207 0.179 1.15 0.249
9 SubjectiveFeverYes 0.432 0.114 3.78 0.000178
10 HeadacheYes -0.00791 0.143 -0.0553 0.956
# i 28 more rows
#Use a trained workflow to predict
predict(flu_fit3, train_data)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
# A tibble: 547 x 1
.pred
<dbl>
1 98.2
2 99.1
3 99.5
4 98.7
5 98.8
6 98.6
7 99.6
8 98.5
9 98.5
10 99.2
# i 537 more rows
#On trained data
<-
flu_aug3train augment(flu_fit3, train_data)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
%>%
flu_aug3train select(BodyTemp, .pred)
# A tibble: 547 x 2
BodyTemp .pred
<dbl> <dbl>
1 97.4 98.2
2 100. 99.1
3 102. 99.5
4 98.7 98.7
5 98.8 98.8
6 99.9 98.6
7 98.7 99.6
8 98.7 98.5
9 99.3 98.5
10 99.4 99.2
# i 537 more rows
#flu_aug3train %>%
# rmse(BodyTemp , .pred)#Unable to get this to run!)
#On test data
<-
flu_aug3test augment(flu_fit3, test_data)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
%>%
flu_aug3test select(BodyTemp, .pred)
# A tibble: 183 x 2
BodyTemp .pred
<dbl> <dbl>
1 100. 99.4
2 101. 98.7
3 98.4 99.2
4 98.1 98.2
5 98.2 98.4
6 100. 99.1
7 99.5 99.4
8 101. 98.9
9 98.8 99.4
10 98.1 99.4
# i 173 more rows
#flu_aug3test %>%
# rmse(BodyTemp , .pred) #Unable to get this to run!
Univariate Model
#create recipe
<- recipe(BodyTemp ~ Myalgia, data = train_data)
flu_rec4
#create workflow
<-
flu_flow4 workflow() %>%
add_model(lm_reg) %>%
add_recipe(flu_rec4)
#fit to training data
<-
flu_fit4 %>%
flu_flow4 fit(data = train_data)
%>%
flu_fit4 extract_fit_parsnip() %>%
tidy()
# A tibble: 4 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 98.6 0.151 652. 0
2 MyalgiaMild 0.278 0.176 1.58 0.115
3 MyalgiaModerate 0.349 0.169 2.06 0.0400
4 MyalgiaSevere 0.444 0.199 2.23 0.0261
#Use a trained workflow to predict
predict(flu_fit4, train_data)
# A tibble: 547 x 1
.pred
<dbl>
1 98.9
2 98.9
3 98.9
4 98.9
5 98.9
6 98.9
7 98.9
8 99.0
9 98.9
10 98.9
# i 537 more rows
#On trained data
<-
flu_aug4train augment(flu_fit4, train_data)
%>%
flu_aug4train select(BodyTemp, .pred)
# A tibble: 547 x 2
BodyTemp .pred
<dbl> <dbl>
1 97.4 98.9
2 100. 98.9
3 102. 98.9
4 98.7 98.9
5 98.8 98.9
6 99.9 98.9
7 98.7 98.9
8 98.7 99.0
9 99.3 98.9
10 99.4 98.9
# i 537 more rows
#flu_aug4train %>%
# rmse(BodyTemp , .pred)#Unable to get this to run!
#On test data
<-
flu_aug4test augment(flu_fit4, test_data)
%>%
flu_aug4test select(BodyTemp, .pred)
# A tibble: 183 x 2
BodyTemp .pred
<dbl> <dbl>
1 100. 99.0
2 101. 99.0
3 98.4 98.9
4 98.1 98.6
5 98.2 98.9
6 100. 98.9
7 99.5 98.9
8 101. 99.0
9 98.8 98.6
10 98.1 98.9
# i 173 more rows
#flu_aug4test %>%
# rmse(BodyTemp , .pred) #Unable to get this to run!
Unable to get RMSE to work after troubleshooting via several help pages.