Flu Analysis

Model Evaluation

Author

Seth Lattner

This is the final file of a four-part data analysis exercise, conducted on the dataset from McKay et al 2020, found here. This file contains model fitting and model performance evaluation.

Load Data/Packages

#load necessary packages
library(tidyverse)
library(tidymodels)
library(here)
library(performance)
library(dplyr)
library(yardstick)

#load and view data
flu_data <- readRDS(here::here("fluanalysis", "data", "flu_data_clean.RDS"))
glimpse(flu_data)

Rows: 730
Columns: 32
$ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y~
$ ChestCongestion   <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
$ ChillsSweats      <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, ~
$ NasalCongestion   <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
$ CoughYN           <fct> Yes, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, No, ~
$ Sneeze            <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, ~
$ Fatigue           <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ SubjectiveFever   <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes~
$ Headache          <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes~
$ Weakness          <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi~
$ WeaknessYN        <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ CoughIntensity    <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve~
$ CoughYN2          <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes~
$ Myalgia           <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, ~
$ MyalgiaYN         <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ RunnyNose         <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No~
$ AbPain            <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N~
$ ChestPain         <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, ~
$ Diarrhea          <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,~
$ EyePn             <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye~
$ Insomnia          <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y~
$ ItchyEye          <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,~
$ Nausea            <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y~
$ EarPn             <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y~
$ Hearing           <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No,~
$ Pharyngitis       <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, ~
$ Breathless        <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, ~
$ ToothPn           <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N~
$ Vision            <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, ~
$ Vomit             <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N~
$ Wheeze            <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N~
$ BodyTemp          <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, ~

Multivariate Model

Create training and testing datasets from the original data.

#Fix random numbers by setting seed
set.seed(1999)
# Put 3/4 of the data into the training set 
data_split <- initial_split(flu_data, prop = 3/4)

# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

Create recipe and set engine

#create recipe
flu_rec <- recipe(Nausea ~ ., data = train_data)

#set engine
log_reg <- 
  logistic_reg() %>%
  set_engine("glm")

Create workflow and fit the training data

#create workflow
flu_flow <- 
  workflow() %>%
  add_model(log_reg) %>%
  add_recipe(flu_rec)

#fit to training data
flu_fit <-
  flu_flow %>%
  fit(data = train_data)

Extract model fit

flu_fit %>%
  extract_fit_parsnip() %>% 
  tidy()

# A tibble: 38 x 5
   term                 estimate std.error statistic p.value
   <chr>                   <dbl>     <dbl>     <dbl>   <dbl>
 1 (Intercept)            8.31       9.74      0.853  0.393 
 2 SwollenLymphNodesYes  -0.279      0.226    -1.23   0.218 
 3 ChestCongestionYes     0.477      0.252     1.90   0.0580
 4 ChillsSweatsYes        0.406      0.338     1.20   0.230 
 5 NasalCongestionYes     0.568      0.309     1.84   0.0657
 6 CoughYNYes            -0.218      0.575    -0.380  0.704 
 7 SneezeYes              0.0698     0.248     0.281  0.779 
 8 FatigueYes             0.394      0.442     0.891  0.373 
 9 SubjectiveFeverYes     0.430      0.261     1.65   0.0997
10 HeadacheYes            0.441      0.346     1.27   0.203 
# i 28 more rows

Fit testing data using model created from training data

predict(flu_fit, test_data)

Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading

# A tibble: 183 x 1
   .pred_class
   <fct>      
 1 No         
 2 Yes        
 3 No         
 4 No         
 5 No         
 6 No         
 7 Yes        
 8 No         
 9 No         
10 Yes        
# i 173 more rows

flu_aug <- 
  augment(flu_fit, test_data)

Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading

Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading

flu_aug %>%
  select(Nausea, .pred_No, .pred_Yes)

# A tibble: 183 x 3
   Nausea .pred_No .pred_Yes
   <fct>     <dbl>     <dbl>
 1 No        0.711     0.289
 2 Yes       0.105     0.895
 3 Yes       0.622     0.378
 4 Yes       0.675     0.325
 5 Yes       0.606     0.394
 6 No        0.664     0.336
 7 No        0.448     0.552
 8 No        0.665     0.335
 9 No        0.617     0.383
10 Yes       0.462     0.538
# i 173 more rows

flu_aug %>%
  roc_curve(truth = Nausea, .pred_No) %>%
  autoplot()

Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
i Please use `reframe()` instead.
i When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly.
i The deprecated feature was likely used in the yardstick package.
  Please report the issue at <https://github.com/tidymodels/yardstick/issues>.

flu_aug %>%
  roc_auc(truth = Nausea, .pred_No)

# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.723

Univariate Model

Create recipe and set engine

#create recipe
flu_rec2 <- recipe(Nausea ~ RunnyNose, data = train_data)

Create workflow and fit the training data

#create workflow
flu_flow2 <- 
  workflow() %>%
  add_model(log_reg) %>%
  add_recipe(flu_rec2)

#fit to training data
flu_fit2 <-
  flu_flow2 %>%
  fit(data = train_data)

Extract model fit

flu_fit2 %>%
  extract_fit_parsnip() %>% 
  tidy()

# A tibble: 2 x 5
  term         estimate std.error statistic  p.value
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)    -0.582     0.165    -3.52  0.000433
2 RunnyNoseYes   -0.104     0.197    -0.525 0.600

Fit testing data using model created from training data

predict(flu_fit2, test_data)

# A tibble: 183 x 1
   .pred_class
   <fct>      
 1 No         
 2 No         
 3 No         
 4 No         
 5 No         
 6 No         
 7 No         
 8 No         
 9 No         
10 No         
# i 173 more rows

flu_aug2 <- 
  augment(flu_fit2, test_data)

flu_aug2 %>%
  select(Nausea, .pred_No, .pred_Yes)

# A tibble: 183 x 3
   Nausea .pred_No .pred_Yes
   <fct>     <dbl>     <dbl>
 1 No        0.642     0.358
 2 Yes       0.665     0.335
 3 Yes       0.642     0.358
 4 Yes       0.665     0.335
 5 Yes       0.665     0.335
 6 No        0.665     0.335
 7 No        0.642     0.358
 8 No        0.642     0.358
 9 No        0.642     0.358
10 Yes       0.665     0.335
# i 173 more rows

flu_aug2 %>%
  roc_curve(truth = Nausea, .pred_No) %>%
  autoplot()

flu_aug2 %>%
  roc_auc(truth = Nausea, .pred_No)

# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.449

This section added by Raquel Francisco

Working with continous variable (BodyTemp)

##Will use as much as premade data/code as possible

#create recipe
flu_rec3 <- recipe(BodyTemp ~ ., data = train_data)

#reset engine
lm_reg <- 
  linear_reg() %>%
  set_engine("lm")

#create workflow
flu_flow3 <- 
  workflow() %>%
  add_model(lm_reg) %>%
  add_recipe(flu_rec3)

#fit to training data
flu_fit3 <-
  flu_flow3 %>%
  fit(data = train_data)

flu_fit3 %>% 
  extract_fit_parsnip() %>% 
  tidy()

# A tibble: 38 x 5
   term                 estimate std.error statistic  p.value
   <chr>                   <dbl>     <dbl>     <dbl>    <dbl>
 1 (Intercept)          98.3         0.343  286.     0       
 2 SwollenLymphNodesYes -0.135       0.102   -1.33   0.186   
 3 ChestCongestionYes    0.0318      0.110    0.289  0.772   
 4 ChillsSweatsYes       0.221       0.142    1.55   0.121   
 5 NasalCongestionYes   -0.276       0.129   -2.14   0.0332  
 6 CoughYNYes            0.345       0.256    1.35   0.178   
 7 SneezeYes            -0.401       0.111   -3.61   0.000336
 8 FatigueYes            0.207       0.179    1.15   0.249   
 9 SubjectiveFeverYes    0.432       0.114    3.78   0.000178
10 HeadacheYes          -0.00791     0.143   -0.0553 0.956   
# i 28 more rows

#Use a trained workflow to predict
predict(flu_fit3, train_data)

Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading

# A tibble: 547 x 1
   .pred
   <dbl>
 1  98.2
 2  99.1
 3  99.5
 4  98.7
 5  98.8
 6  98.6
 7  99.6
 8  98.5
 9  98.5
10  99.2
# i 537 more rows

#On trained data
flu_aug3train <- 
  augment(flu_fit3, train_data)

Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading

flu_aug3train %>%
  select(BodyTemp, .pred)

# A tibble: 547 x 2
   BodyTemp .pred
      <dbl> <dbl>
 1     97.4  98.2
 2    100.   99.1
 3    102.   99.5
 4     98.7  98.7
 5     98.8  98.8
 6     99.9  98.6
 7     98.7  99.6
 8     98.7  98.5
 9     99.3  98.5
10     99.4  99.2
# i 537 more rows

#flu_aug3train %>%
#  rmse(BodyTemp , .pred)#Unable to get this to run!)

#On test data
flu_aug3test <- 
  augment(flu_fit3, test_data)

Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading

flu_aug3test %>%
  select(BodyTemp, .pred)

# A tibble: 183 x 2
   BodyTemp .pred
      <dbl> <dbl>
 1    100.   99.4
 2    101.   98.7
 3     98.4  99.2
 4     98.1  98.2
 5     98.2  98.4
 6    100.   99.1
 7     99.5  99.4
 8    101.   98.9
 9     98.8  99.4
10     98.1  99.4
# i 173 more rows

#flu_aug3test %>%
#  rmse(BodyTemp , .pred) #Unable to get this to run!

Univariate Model

#create recipe
flu_rec4 <- recipe(BodyTemp ~ Myalgia, data = train_data)

#create workflow
flu_flow4 <- 
  workflow() %>%
  add_model(lm_reg) %>%
  add_recipe(flu_rec4)

#fit to training data
flu_fit4 <-
  flu_flow4 %>%
  fit(data = train_data)

flu_fit4 %>% 
  extract_fit_parsnip() %>% 
  tidy()

# A tibble: 4 x 5
  term            estimate std.error statistic p.value
  <chr>              <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)       98.6       0.151    652.    0     
2 MyalgiaMild        0.278     0.176      1.58  0.115 
3 MyalgiaModerate    0.349     0.169      2.06  0.0400
4 MyalgiaSevere      0.444     0.199      2.23  0.0261

#Use a trained workflow to predict
predict(flu_fit4, train_data)

# A tibble: 547 x 1
   .pred
   <dbl>
 1  98.9
 2  98.9
 3  98.9
 4  98.9
 5  98.9
 6  98.9
 7  98.9
 8  99.0
 9  98.9
10  98.9
# i 537 more rows

#On trained data
flu_aug4train <- 
  augment(flu_fit4, train_data)

flu_aug4train %>%
  select(BodyTemp, .pred)

# A tibble: 547 x 2
   BodyTemp .pred
      <dbl> <dbl>
 1     97.4  98.9
 2    100.   98.9
 3    102.   98.9
 4     98.7  98.9
 5     98.8  98.9
 6     99.9  98.9
 7     98.7  98.9
 8     98.7  99.0
 9     99.3  98.9
10     99.4  98.9
# i 537 more rows

#flu_aug4train %>%
#  rmse(BodyTemp , .pred)#Unable to get this to run!

#On test data
flu_aug4test <- 
  augment(flu_fit4, test_data)

flu_aug4test %>%
  select(BodyTemp, .pred)

# A tibble: 183 x 2
   BodyTemp .pred
      <dbl> <dbl>
 1    100.   99.0
 2    101.   99.0
 3     98.4  98.9
 4     98.1  98.6
 5     98.2  98.9
 6    100.   98.9
 7     99.5  98.9
 8    101.   99.0
 9     98.8  98.6
10     98.1  98.9
# i 173 more rows

#flu_aug4test %>%
#  rmse(BodyTemp , .pred) #Unable to get this to run!

Unable to get RMSE to work after troubleshooting via several help pages.