#load necessary packages
library(tidyverse)
library(tidymodels)
library(kableExtra)
library(here)
library(performance)
Flu Analysis
Fitting
This is the third file of a four-part data analysis exercise, conducted on the dataset from McKay et al. (2020), found here. This file contains the data analysis and model fitting.
Load Data/Packages
#load and view data
flu_data <- readRDS(here::here("fluanalysis", "data", "flu_data_clean.RDS"))
glimpse(flu_data)
Rows: 730
Columns: 32
$ SwollenLymphNodes <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Y~
$ ChestCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
$ ChillsSweats <fct> No, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, ~
$ NasalCongestion <fct> No, Yes, Yes, Yes, No, No, No, Yes, Yes, Yes, Yes, Y~
$ CoughYN <fct> Yes, Yes, No, Yes, No, Yes, Yes, Yes, Yes, Yes, No, ~
$ Sneeze <fct> No, No, Yes, Yes, No, Yes, No, Yes, No, No, No, No, ~
$ Fatigue <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ SubjectiveFever <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes~
$ Headache <fct> Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes~
$ Weakness <fct> Mild, Severe, Severe, Severe, Moderate, Moderate, Mi~
$ WeaknessYN <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ CoughIntensity <fct> Severe, Severe, Mild, Moderate, None, Moderate, Seve~
$ CoughYN2 <fct> Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes~
$ Myalgia <fct> Mild, Severe, Severe, Severe, Mild, Moderate, Mild, ~
$ MyalgiaYN <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Ye~
$ RunnyNose <fct> No, No, Yes, Yes, No, No, Yes, Yes, Yes, Yes, No, No~
$ AbPain <fct> No, No, Yes, No, No, No, No, No, No, No, Yes, Yes, N~
$ ChestPain <fct> No, No, Yes, No, No, Yes, Yes, No, No, No, No, Yes, ~
$ Diarrhea <fct> No, No, No, No, No, Yes, No, No, No, No, No, No, No,~
$ EyePn <fct> No, No, No, No, Yes, No, No, No, No, No, Yes, No, Ye~
$ Insomnia <fct> No, No, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Yes, Y~
$ ItchyEye <fct> No, No, No, No, No, No, No, No, No, No, No, No, Yes,~
$ Nausea <fct> No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes, Yes, Y~
$ EarPn <fct> No, Yes, No, Yes, No, No, No, No, No, No, No, Yes, Y~
$ Hearing <fct> No, Yes, No, No, No, No, No, No, No, No, No, No, No,~
$ Pharyngitis <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, ~
$ Breathless <fct> No, No, Yes, No, No, Yes, No, No, No, Yes, No, Yes, ~
$ ToothPn <fct> No, No, Yes, No, No, No, No, No, Yes, No, No, Yes, N~
$ Vision <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, ~
$ Vomit <fct> No, No, No, No, No, No, Yes, No, No, No, Yes, Yes, N~
$ Wheeze <fct> No, No, No, Yes, No, Yes, No, No, No, No, No, Yes, N~
$ BodyTemp <dbl> 98.3, 100.4, 100.8, 98.8, 100.5, 98.4, 102.5, 98.4, ~
Model Fitting
Body Temperature
First, I want to fit a linear regression model using our first outcome of interest, body temperature, and a single predictor variable, runny nose.
flu_lm1 <- linear_reg() %>%
  set_engine("lm") %>%
  fit(BodyTemp ~ RunnyNose, data = flu_data)

kableExtra::kable(tidy(flu_lm1)) %>%
  kableExtra::kable_classic(full_width = FALSE, font = "Arial")
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 99.1431280 | 0.0819076 | 1210.425819 | 0.00000 |
RunnyNoseYes | -0.2926463 | 0.0971409 | -3.012595 | 0.00268 |
glance(flu_lm1)
# A tibble: 1 x 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.0123 0.0110 1.19 9.08 0.00268 1 -1162. 2329. 2343.
# i 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
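glance() reports sigma but not RMSE directly; here is a minimal sketch of how the RMSE could be computed with yardstick (attached via tidymodels), where flu_lm1_preds is just an illustrative name, not part of the original analysis.
#get predictions from the simple model and line them up with the observed body temperatures
flu_lm1_preds <- predict(flu_lm1, new_data = flu_data) %>%
  bind_cols(flu_data %>% select(BodyTemp))

#RMSE of the fitted values on the training data
flu_lm1_preds %>%
  rmse(truth = BodyTemp, estimate = .pred)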
Based on the R-squared value and the RMSE, this does not seem to be a great fit to the data. Next, I want to throw all of the other variables into a global model and see if it does a better job of explaining the data than the first model.
flu_lm2 <- linear_reg() %>%
  set_engine("lm") %>%
  fit(BodyTemp ~ . , data = flu_data)

kableExtra::kable(tidy(flu_lm2)) %>%
  kableExtra::kable_classic(full_width = FALSE, font = "Arial")
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 97.9252434 | 0.3038043 | 322.3300394 | 0.0000000 |
SwollenLymphNodesYes | -0.1653017 | 0.0919592 | -1.7975544 | 0.0726816 |
ChestCongestionYes | 0.0873264 | 0.0975461 | 0.8952326 | 0.3709727 |
ChillsSweatsYes | 0.2012657 | 0.1273016 | 1.5810148 | 0.1143296 |
NasalCongestionYes | -0.2157711 | 0.1137978 | -1.8960920 | 0.0583624 |
CoughYNYes | 0.3138934 | 0.2407384 | 1.3038777 | 0.1927070 |
SneezeYes | -0.3619237 | 0.0982987 | -3.6818764 | 0.0002495 |
FatigueYes | 0.2647620 | 0.1605576 | 1.6490163 | 0.0995962 |
SubjectiveFeverYes | 0.4368372 | 0.1033982 | 4.2248066 | 0.0000271 |
HeadacheYes | 0.0114533 | 0.1254052 | 0.0913306 | 0.9272562 |
WeaknessMild | 0.0182293 | 0.1891690 | 0.0963651 | 0.9232584 |
WeaknessModerate | 0.0989441 | 0.1978635 | 0.5000625 | 0.6171894 |
WeaknessSevere | 0.3734354 | 0.2307665 | 1.6182393 | 0.1060648 |
WeaknessYNYes | NA | NA | NA | NA |
CoughIntensityMild | 0.0848811 | 0.2798780 | 0.3032791 | 0.7617680 |
CoughIntensityModerate | -0.0613837 | 0.3019966 | -0.2032595 | 0.8389917 |
CoughIntensitySevere | -0.0372720 | 0.3140127 | -0.1186957 | 0.9055507 |
CoughYN2Yes | NA | NA | NA | NA |
MyalgiaMild | 0.1642421 | 0.1604984 | 1.0233257 | 0.3065099 |
MyalgiaModerate | -0.0240642 | 0.1678337 | -0.1433810 | 0.8860309 |
MyalgiaSevere | -0.1292631 | 0.2078542 | -0.6218934 | 0.5342159 |
MyalgiaYNYes | NA | NA | NA | NA |
RunnyNoseYes | -0.0804851 | 0.1085262 | -0.7416190 | 0.4585687 |
AbPainYes | 0.0315744 | 0.1402357 | 0.2251524 | 0.8219269 |
ChestPainYes | 0.1050706 | 0.1069797 | 0.9821547 | 0.3263654 |
DiarrheaYes | -0.1568064 | 0.1295451 | -1.2104390 | 0.2265220 |
EyePnYes | 0.1315436 | 0.1297573 | 1.0137665 | 0.3110470 |
InsomniaYes | -0.0068237 | 0.0907966 | -0.0751542 | 0.9401137 |
ItchyEyeYes | -0.0080161 | 0.1101909 | -0.0727470 | 0.9420284 |
NauseaYes | -0.0340655 | 0.1020492 | -0.3338147 | 0.7386200 |
EarPnYes | 0.0937899 | 0.1138747 | 0.8236242 | 0.4104357 |
HearingYes | 0.2322025 | 0.2220428 | 1.0457557 | 0.2960374 |
PharyngitisYes | 0.3175805 | 0.1213416 | 2.6172439 | 0.0090571 |
BreathlessYes | 0.0905257 | 0.0998375 | 0.9067309 | 0.3648634 |
ToothPnYes | -0.0228762 | 0.1137504 | -0.2011084 | 0.8406726 |
VisionYes | -0.2746251 | 0.2776810 | -0.9889948 | 0.3230099 |
VomitYes | 0.1652722 | 0.1514323 | 1.0913937 | 0.2754779 |
WheezeYes | -0.0466649 | 0.1070356 | -0.4359755 | 0.6629899 |
glance(flu_lm2)
# A tibble: 1 x 12
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.129 0.0860 1.14 3.02 0.0000000420 34 -1116. 2304. 2469.
# i 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
Let’s compare these models side-by-side.
compare_performance(flu_lm1, flu_lm2)
# Comparison of Model Performance Indices
Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | R2 | R2 (adj.) | RMSE | Sigma
------------------------------------------------------------------------------------------------------
flu_lm1 | _lm | 2329.3 (<.001) | 2329.4 (<.001) | 2343.1 (>.999) | 0.012 | 0.011 | 1.188 | 1.190
flu_lm2 | _lm | 2303.8 (>.999) | 2307.7 (>.999) | 2469.2 (<.001) | 0.129 | 0.086 | 1.116 | 1.144
Even though it is still not a great fit to the data (R-squared = 0.13), the global model explains the data better than the first model based on R-squared and RMSE, as well as AIC.
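Since the single-predictor model is nested in the global model, an F-test is another way to formalize this comparison. A minimal sketch, assuming the underlying lm objects can be pulled out of the parsnip fits with extract_fit_engine():
#F-test comparing the nested single-predictor model against the global model
anova(extract_fit_engine(flu_lm1), extract_fit_engine(flu_lm2))
A small p-value here would indicate that the additional predictors improve the fit beyond runny nose alone.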
Nausea
Now, I want to fit a logistic regression model using our second outcome of interest, nausea, with a single predictor variable, runny nose.
flu_glm1 <- logistic_reg() %>%
  set_engine("glm") %>%
  fit(Nausea ~ RunnyNose, data = flu_data)

kableExtra::kable(tidy(flu_glm1)) %>%
  kableExtra::kable_classic(full_width = FALSE, font = "Arial")
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | -0.6578078 | 0.1452003 | -4.5303474 | 0.0000059 |
RunnyNoseYes | 0.0501828 | 0.1718249 | 0.2920578 | 0.7702424 |
glance(flu_glm1)
# A tibble: 1 x 8
null.deviance df.null logLik AIC BIC deviance df.residual nobs
<dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 945. 729 -472. 949. 958. 945. 728 730
The residual deviance is essentially the same as the null deviance, so runny nose alone tells us very little about nausea. As before, I want to fit a global model using nausea as the outcome and all of the other variables in the dataset as predictors, and compare it to the first model.
flu_glm2 <- logistic_reg() %>%
  set_engine("glm") %>%
  fit(Nausea ~ . , data = flu_data)

kableExtra::kable(tidy(flu_glm2)) %>%
  kableExtra::kable_classic(full_width = FALSE, font = "Arial")
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 0.2228705 | 7.8274090 | 0.0284731 | 0.9772848 |
SwollenLymphNodesYes | -0.2510826 | 0.1960289 | -1.2808444 | 0.2002483 |
ChestCongestionYes | 0.2755537 | 0.2126616 | 1.2957376 | 0.1950659 |
ChillsSweatsYes | 0.2740967 | 0.2878278 | 0.9522940 | 0.3409479 |
NasalCongestionYes | 0.4258174 | 0.2545609 | 1.6727521 | 0.0943761 |
CoughYNYes | -0.1404234 | 0.5187985 | -0.2706705 | 0.7866445 |
SneezeYes | 0.1767237 | 0.2103493 | 0.8401441 | 0.4008276 |
FatigueYes | 0.2290618 | 0.3718816 | 0.6159535 | 0.5379252 |
SubjectiveFeverYes | 0.2777411 | 0.2253628 | 1.2324177 | 0.2177931 |
HeadacheYes | 0.3312592 | 0.2848965 | 1.1627353 | 0.2449369 |
WeaknessMild | -0.1216059 | 0.4468864 | -0.2721182 | 0.7855312 |
WeaknessModerate | 0.3108487 | 0.4544826 | 0.6839618 | 0.4939993 |
WeaknessSevere | 0.8231869 | 0.5104242 | 1.6127505 | 0.1067987 |
WeaknessYNYes | NA | NA | NA | NA |
CoughIntensityMild | -0.2207938 | 0.5843673 | -0.3778339 | 0.7055540 |
CoughIntensityModerate | -0.3626784 | 0.6313701 | -0.5744307 | 0.5656764 |
CoughIntensitySevere | -0.9505442 | 0.6581423 | -1.4442837 | 0.1486592 |
CoughYN2Yes | NA | NA | NA | NA |
MyalgiaMild | -0.0041462 | 0.3680943 | -0.0112640 | 0.9910128 |
MyalgiaModerate | 0.2047429 | 0.3732305 | 0.5485697 | 0.5833008 |
MyalgiaSevere | 0.1207580 | 0.4449265 | 0.2714112 | 0.7860748 |
MyalgiaYNYes | NA | NA | NA | NA |
RunnyNoseYes | 0.0453236 | 0.2326446 | 0.1948189 | 0.8455347 |
AbPainYes | 0.9393036 | 0.2814632 | 3.3372164 | 0.0008462 |
ChestPainYes | 0.0707772 | 0.2278583 | 0.3106192 | 0.7560901 |
DiarrheaYes | 1.0639338 | 0.2587051 | 4.1125354 | 0.0000391 |
EyePnYes | -0.3419910 | 0.2777198 | -1.2314249 | 0.2181640 |
InsomniaYes | 0.0841752 | 0.1929851 | 0.4361747 | 0.6627100 |
ItchyEyeYes | -0.0633645 | 0.2325014 | -0.2725337 | 0.7852117 |
EarPnYes | -0.1817189 | 0.2392074 | -0.7596707 | 0.4474514 |
HearingYes | 0.3230517 | 0.4524022 | 0.7140808 | 0.4751772 |
PharyngitisYes | 0.2753644 | 0.2660588 | 1.0349756 | 0.3006803 |
BreathlessYes | 0.5268015 | 0.2085792 | 2.5256661 | 0.0115479 |
ToothPnYes | 0.4806486 | 0.2294739 | 2.0945674 | 0.0362095 |
VisionYes | 0.1254977 | 0.5411141 | 0.2319247 | 0.8165965 |
VomitYes | 2.4584655 | 0.3486077 | 7.0522411 | 0.0000000 |
WheezeYes | -0.3044348 | 0.2340838 | -1.3005376 | 0.1934168 |
BodyTemp | -0.0312460 | 0.0798381 | -0.3913668 | 0.6955261 |
glance(flu_glm2)
# A tibble: 1 x 8
null.deviance df.null logLik AIC BIC deviance df.residual nobs
<dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 945. 729 -376. 821. 982. 751. 695 730
Let’s see which model is a better fit. (The NA coefficients above, and the rank-deficiency warnings below, come from the Yes/No indicator versions of weakness, cough intensity, and myalgia, which are redundant with their multi-level counterparts.)
compare_performance(flu_glm1, flu_glm2)
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
# Comparison of Model Performance Indices
Name | Model | AIC (weights) | AICc (weights) | BIC (weights) | Tjur's R2 | RMSE | Sigma | Log_loss | Score_log | Score_spherical | PCP
----------------------------------------------------------------------------------------------------------------------------------------------
flu_glm1 | _glm | 948.6 (<.001) | 948.6 (<.001) | 957.8 (>.999) | 1.169e-04 | 0.477 | 1.139 | 0.647 | -107.871 | 0.012 | 0.545
flu_glm2 | _glm | 821.5 (>.999) | 825.1 (>.999) | 982.2 (<.001) | 0.247 | 0.414 | 1.040 | 0.515 | -Inf | 0.002 | 0.658
Once again, the global model explains the data better than the univariate model, with a substantially lower AIC, a lower RMSE, and a much higher Tjur's R2.
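As an additional, purely illustrative check of predictive performance rather than in-sample fit, the two logistic models could also be compared on ROC AUC with yardstick. A minimal sketch, assuming "Yes" is the second factor level of Nausea and using get_auc as a hypothetical helper evaluated on the training data:
#hypothetical helper: ROC AUC for a fitted parsnip logistic model, computed on the training data
get_auc <- function(model) {
  predict(model, new_data = flu_data, type = "prob") %>%
    bind_cols(flu_data %>% select(Nausea)) %>%
    roc_auc(truth = Nausea, .pred_Yes, event_level = "second")
}

get_auc(flu_glm1)
get_auc(flu_glm2)
Evaluating on the training data will favor the larger model, so a train/test split or cross-validation would give a fairer comparison.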