Predict weight from height. See Chapters 3, 9 and 10 in Regression and Other Stories.
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
earnings <- read.csv(root("Earnings/data","earnings.csv"))
head(earnings)
height weight male earn earnk ethnicity education mother_education
1 74 210 1 50000 50 White 16 16
2 66 125 0 60000 60 White 16 16
3 64 126 0 30000 30 White 16 16
4 65 200 0 25000 25 White 17 17
5 63 110 0 50000 50 Other 16 16
6 68 165 0 62000 62 Black 18 18
father_education walk exercise smokenow tense angry age
1 16 3 3 2 0 0 45
2 16 6 5 1 0 0 58
3 16 8 1 2 1 1 29
4 NA 8 1 2 0 0 57
5 16 5 6 2 0 0 91
6 18 1 1 2 2 2 54
The option refresh = 0
supresses the default Stan sampling progress output. This is useful for small data with fast computation. For more complex models and bigger data, it can be useful to see the progress.
fit_1 <- stan_glm(weight ~ height, data=earnings, refresh = 0)
print(fit_1)
stan_glm
family: gaussian [identity]
formula: weight ~ height
observations: 1789
predictors: 2
------
Median MAD_SD
(Intercept) -173.1 11.8
height 4.9 0.2
Auxiliary parameter(s):
Median MAD_SD
sigma 29.0 0.5
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
**Predict weight for 66 inches person
coefs_1 <- coef(fit_1)
predicted_1 <- coefs_1[1] + coefs_1[2]*66
round(predicted_1, 1)
(Intercept)
153.3
or
new <- data.frame(height=66)
pred <- posterior_predict(fit_1, newdata=new)
cat("Predicted weight for a 66-inch-tall person is", round(mean(pred)),
"pounds with a sd of", round(sd(pred)), "\n")
Predicted weight for a 66-inch-tall person is 152 pounds with a sd of 29
earnings$c_height <- earnings$height - 66
fit_2 <- stan_glm(weight ~ c_height, data=earnings, refresh = 0)
print(fit_2)
stan_glm
family: gaussian [identity]
formula: weight ~ c_height
observations: 1789
predictors: 2
------
Median MAD_SD
(Intercept) 153.4 0.7
c_height 5.0 0.2
Auxiliary parameter(s):
Median MAD_SD
sigma 29.0 0.5
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
new <- data.frame(c_height=4.0)
point_pred_2 <- predict(fit_2, newdata=new)
round(point_pred_2, 1)
1
173.2
variation coming from posterior uncertainty in the coefficients
linpred_2 <- posterior_linpred(fit_2, newdata=new)
hist(linpred_2)
variation coming from posterior uncertainty in the coefficients and predictive uncertainty
postpred_2 <- posterior_predict(fit_2, newdata=new)
hist(postpred_2)
new <- data.frame(height=66)
pred <- posterior_predict(fit_1, newdata=new)
cat("Predicted weight for a 66-inch-tall person is", round(mean(pred)), "pounds with a sd of", round(sd(pred)), "\n")
Predicted weight for a 66-inch-tall person is 154 pounds with a sd of 29
fit_3 <- stan_glm(weight ~ c_height + male, data=earnings, refresh = 0)
print(fit_3)
stan_glm
family: gaussian [identity]
formula: weight ~ c_height + male
observations: 1789
predictors: 3
------
Median MAD_SD
(Intercept) 149.5 1.0
c_height 3.9 0.2
male 11.9 2.0
Auxiliary parameter(s):
Median MAD_SD
sigma 28.7 0.5
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
new <- data.frame(c_height=4, male=0)
pred <- posterior_predict(fit_3, newdata=new)
cat("Predicted weight for a 70-inch-tall female is", round(mean(pred)), "pounds with a sd of", round(sd(pred)), "\n")
Predicted weight for a 70-inch-tall female is 165 pounds with a sd of 29
Include ethnicity in the regression as a factor
fit_4 <- stan_glm(weight ~ c_height + male + factor(ethnicity),
data=earnings, refresh = 0)
print(fit_4)
stan_glm
family: gaussian [identity]
formula: weight ~ c_height + male + factor(ethnicity)
observations: 1789
predictors: 6
------
Median MAD_SD
(Intercept) 154.4 2.2
c_height 3.8 0.2
male 12.1 2.0
factor(ethnicity)Hispanic -6.2 3.5
factor(ethnicity)Other -12.3 5.3
factor(ethnicity)White -5.1 2.3
Auxiliary parameter(s):
Median MAD_SD
sigma 28.6 0.5
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
earnings$eth <- factor(earnings$ethnicity,
levels=c("White", "Black", "Hispanic", "Other"))
fit_5 <- stan_glm(weight ~ c_height + male + ethnicity, data=earnings, refresh = 0)
print(fit_5)
stan_glm
family: gaussian [identity]
formula: weight ~ c_height + male + ethnicity
observations: 1789
predictors: 6
------
Median MAD_SD
(Intercept) 154.3 2.3
c_height 3.9 0.3
male 12.1 2.0
ethnicityHispanic -6.2 3.7
ethnicityOther -12.2 5.4
ethnicityWhite -5.2 2.4
Auxiliary parameter(s):
Median MAD_SD
sigma 28.7 0.5
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
earnings$eth_white <- ifelse(earnings$ethnicity=="White", 1, 0)
earnings$eth_black <- ifelse(earnings$ethnicity=="Black", 1, 0)
earnings$eth_hispanic <- ifelse(earnings$ethnicity=="Hispanic", 1, 0)
earnings$eth_other <- ifelse(earnings$ethnicity=="Other", 1, 0)
fit_6 <- stan_glm(weight ~ c_height + male + eth_black + eth_hispanic + eth_other,
data=earnings, refresh = 0)
print(fit_6)
stan_glm
family: gaussian [identity]
formula: weight ~ c_height + male + eth_black + eth_hispanic + eth_other
observations: 1789
predictors: 6
------
Median MAD_SD
(Intercept) 149.1 1.0
c_height 3.9 0.3
male 12.1 2.1
eth_black 5.2 2.3
eth_hispanic -1.0 2.9
eth_other -7.0 5.0
Auxiliary parameter(s):
Median MAD_SD
sigma 28.6 0.5
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg