Linear regression with multiple predictors. See Chapters 10, 11 and 12 in Regression and Other Stories.
# rprojroot locates the project root via the marker file .ROS-Examples-root,
# so data paths resolve regardless of the current working directory.
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")   # Bayesian regression models via stan_glm()
library("ggplot2")
library("bayesplot")
# Set a consistent plotting theme for all ggplot figures below.
theme_set(bayesplot::theme_default(base_family = "sans"))
library("foreign")
# Child test scores with maternal predictors (mom_hs, mom_iq, mom_work, mom_age).
kidiq <- read.csv(root("KidIQ/data","kidiq.csv"))
head(kidiq)
kid_score mom_hs mom_iq mom_work mom_age
1 65 1 121.11753 4 27
2 98 1 89.36188 4 25
3 85 1 115.44316 4 27
4 83 1 99.44964 3 25
5 115 1 92.74571 4 27
6 98 0 107.90184 1 18
The option refresh = 0
suppresses the default Stan sampling progress output. This is useful for small data with fast computation. For more complex models and bigger data, it can be useful to see the progress.
# Regression on a single binary predictor: did the mother complete high school?
# refresh = 0 suppresses Stan's sampling progress messages.
fit_1 <- stan_glm(kid_score ~ mom_hs, data=kidiq, refresh = 0)
print(fit_1)
stan_glm
family: gaussian [identity]
formula: kid_score ~ mom_hs
observations: 434
predictors: 2
------
Median MAD_SD
(Intercept) 77.6 2.0
mom_hs 11.8 2.3
Auxiliary parameter(s):
Median MAD_SD
sigma 19.9 0.7
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Regression on a single continuous predictor: the mother's IQ score.
fit_2 <- stan_glm(kid_score ~ mom_iq, data=kidiq, refresh = 0)
print(fit_2)
stan_glm
family: gaussian [identity]
formula: kid_score ~ mom_iq
observations: 434
predictors: 2
------
Median MAD_SD
(Intercept) 25.6 5.9
mom_iq 0.6 0.1
Auxiliary parameter(s):
Median MAD_SD
sigma 18.3 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Scatterplot of child test score against mother IQ, with the fitted
# regression line from fit_2 overlaid.
b2 <- coef(fit_2)
plot(kidiq$mom_iq, kidiq$kid_score,
     xlab="Mother IQ score", ylab="Child test score")
abline(b2[1], b2[2])
# Equivalent ways to draw the same line:
#abline(coef(fit_2))
#curve(cbind(1,x) %*% coef(fit_2), add=TRUE)
# ggplot2 version of the same figure
ggplot(kidiq, aes(mom_iq, kid_score)) +
  geom_point() +
  geom_abline(intercept = b2[1], slope = b2[2]) +
  labs(x = "Mother IQ score", y = "Child test score")
# Two predictors: maternal high-school completion and maternal IQ (no interaction).
fit_3 <- stan_glm(kid_score ~ mom_hs + mom_iq, data=kidiq, refresh = 0)
print(fit_3)
stan_glm
family: gaussian [identity]
formula: kid_score ~ mom_hs + mom_iq
observations: 434
predictors: 3
------
Median MAD_SD
(Intercept) 25.6 6.0
mom_hs 5.9 2.2
mom_iq 0.6 0.1
Auxiliary parameter(s):
Median MAD_SD
sigma 18.1 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Fuller summary than print(): posterior means/quantiles plus MCMC
# diagnostics (mcse, Rhat, n_eff).
summary(fit_3)
Model Info:
function: stan_glm
family: gaussian [identity]
formula: kid_score ~ mom_hs + mom_iq
algorithm: sampling
sample: 4000 (posterior sample size)
priors: see help('prior_summary')
observations: 434
predictors: 3
Estimates:
mean sd 10% 50% 90%
(Intercept) 25.6 5.8 18.2 25.6 33.0
mom_hs 5.9 2.3 3.0 5.9 8.9
mom_iq 0.6 0.1 0.5 0.6 0.6
sigma 18.2 0.6 17.4 18.1 19.0
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 86.8 1.2 85.2 86.8 88.4
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.1 1.0 5102
mom_hs 0.0 1.0 4522
mom_iq 0.0 1.0 4551
sigma 0.0 1.0 4701
mean_PPD 0.0 1.0 4268
log-posterior 0.0 1.0 1749
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
# Display fit_3 as two parallel lines: black for mothers who completed high
# school (intercept shifted by the mom_hs coefficient), gray otherwise.
# Both lines share the mom_iq slope, b_hat[3].
b_hat <- coef(fit_3)
colors <- ifelse(kidiq$mom_hs==1, "black", "gray")
plot(kidiq$mom_iq, kidiq$kid_score, col=colors, pch=20,
     xlab="Mother IQ score", ylab="Child test score")
abline(b_hat[1] + b_hat[2], b_hat[3], col="black")  # mom_hs == 1
abline(b_hat[1], b_hat[3], col="gray")              # mom_hs == 0
# ggplot2 version of the same figure
ggplot(kidiq, aes(mom_iq, kid_score)) +
  geom_point(aes(color = factor(mom_hs)), show.legend = FALSE) +
  geom_abline(
    intercept = c(b_hat[1], b_hat[1] + b_hat[2]),
    slope = b_hat[3],
    color = c("gray", "black")) +
  scale_color_manual(values = c("gray", "black")) +
  labs(x = "Mother IQ score", y = "Child test score")
# Interaction model: the mom_iq slope is allowed to differ by mom_hs group.
fit_4 <- stan_glm(kid_score ~ mom_hs + mom_iq + mom_hs:mom_iq, data=kidiq,
refresh = 0)
print(fit_4)
stan_glm
family: gaussian [identity]
formula: kid_score ~ mom_hs + mom_iq + mom_hs:mom_iq
observations: 434
predictors: 4
------
Median MAD_SD
(Intercept) -9.7 13.5
mom_hs 49.2 15.2
mom_iq 1.0 0.1
mom_hs:mom_iq -0.5 0.2
Auxiliary parameter(s):
Median MAD_SD
sigma 18.0 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Display the interaction model fit_4: each mom_hs group gets its own
# intercept AND its own slope (slope for the black line adds the
# interaction coefficient b_hat[4]).
b_hat <- coef(fit_4)
colors <- ifelse(kidiq$mom_hs==1, "black", "gray")
plot(kidiq$mom_iq, kidiq$kid_score, col=colors, pch=20,
     xlab="Mother IQ score", ylab="Child test score")
abline(b_hat[1] + b_hat[2], b_hat[3] + b_hat[4], col="black")  # mom_hs == 1
abline(b_hat[1], b_hat[3], col="gray")                         # mom_hs == 0
# ggplot2 version of the same figure
ggplot(kidiq, aes(mom_iq, kid_score)) +
  geom_point(aes(color = factor(mom_hs)), show.legend = FALSE) +
  geom_abline(
    intercept = c(b_hat[1], b_hat[1] + b_hat[2]),
    slope = c(b_hat[3], b_hat[3] + b_hat[4]),
    color = c("gray", "black")) +
  scale_color_manual(values = c("gray", "black")) +
  labs(x = "Mother IQ score", y = "Child test score")
# Re-display the simple mom_iq regression before plotting its posterior draws.
print(fit_2)
stan_glm
family: gaussian [identity]
formula: kid_score ~ mom_iq
observations: 434
predictors: 2
------
Median MAD_SD
(Intercept) 25.6 5.9
mom_iq 0.6 0.1
Auxiliary parameter(s):
Median MAD_SD
sigma 18.3 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Show posterior uncertainty in the fitted line: overlay 10 randomly chosen
# posterior draws (gray) on the data, plus the point estimate (black).
sims_2 <- as.matrix(fit_2)      # one row per posterior draw
n_sims_2 <- nrow(sims_2)
subset <- sample(n_sims_2, 10)  # indices of the draws to display
plot(kidiq$mom_iq, kidiq$kid_score,
     xlab="Mother IQ score", ylab="Child test score")
for (draw in subset) {
  abline(sims_2[draw, 1], sims_2[draw, 2], col="gray")
}
abline(coef(fit_2)[1], coef(fit_2)[2], col="black")
The ggplot2 version of the same figure:
# Same figure in ggplot2: the 10 posterior-draw lines in thin gray, the
# point-estimate line in thicker black.
draws <- sims_2[subset, ]
ggplot(kidiq, aes(mom_iq, kid_score)) +
  geom_point() +
  geom_abline(intercept = draws[, 1], slope = draws[, 2],
              color = "gray", size = 0.25) +
  geom_abline(intercept = coef(fit_2)[1], slope = coef(fit_2)[2],
              size = 0.75) +
  labs(x = "Mother IQ score", y = "Child test score")
# Two-panel display of uncertainty in fit_3: each panel varies one predictor
# while holding the other fixed at its sample mean. Gray curves are 10
# posterior draws; the black curve is the point estimate.
sims_3 <- as.matrix(fit_3)
n_sims_3 <- nrow(sims_3)
par(mar=c(3,3,1,3), mgp=c(1.7, .5, 0), tck=-.01)
par(mfrow=c(1,2))
# Left panel: score vs. mother IQ, with mom_hs fixed at its mean.
plot(kidiq$mom_iq, kidiq$kid_score, xlab="Mother IQ score",
     ylab="Child test score", bty="l", pch=20, xaxt="n", yaxt="n")
axis(1, seq(80, 140, 20))
axis(2, seq(20, 140, 40))
mom_hs_bar <- mean(kidiq$mom_hs)
subset <- sample(n_sims_3, 10)
for (s in subset) {
  curve(cbind(1, mom_hs_bar, x) %*% sims_3[s, 1:3], lwd=.5,
        col="gray", add=TRUE)
}
curve(cbind(1, mom_hs_bar, x) %*% coef(fit_3), col="black", add=TRUE)
# Right panel: score vs. mom_hs (jittered so the 0/1 points don't overplot),
# with mom_iq fixed at its mean.
jitt <- runif(nrow(kidiq), -.03, .03)
plot(kidiq$mom_hs + jitt, kidiq$kid_score,
     xlab="Mother completed high school", ylab="Child test score",
     bty="l", pch=20, xaxt="n", yaxt="n")
axis(1, c(0,1))
axis(2, seq(20, 140, 40))
mom_iq_bar <- mean(kidiq$mom_iq)
for (s in subset) {
  curve(cbind(1, x, mom_iq_bar) %*% sims_3[s, 1:3], lwd=.5,
        col="gray", add=TRUE)
}
curve(cbind(1, x, mom_iq_bar) %*% coef(fit_3), col="black", add=TRUE)
# Center each predictor at its sample mean so the main effects of the
# interaction model are interpretable at average predictor values.
kidiq$c_mom_hs <- kidiq$mom_hs - mean(kidiq$mom_hs)
kidiq$c_mom_iq <- kidiq$mom_iq - mean(kidiq$mom_iq)
fit_4c <- stan_glm(kid_score ~ c_mom_hs + c_mom_iq + c_mom_hs:c_mom_iq,
data=kidiq, refresh = 0)
print(fit_4c)
stan_glm
family: gaussian [identity]
formula: kid_score ~ c_mom_hs + c_mom_iq + c_mom_hs:c_mom_iq
observations: 434
predictors: 4
------
Median MAD_SD
(Intercept) 87.6 0.9
c_mom_hs 2.9 2.4
c_mom_iq 0.6 0.1
c_mom_hs:c_mom_iq -0.5 0.2
Auxiliary parameter(s):
Median MAD_SD
sigma 18.0 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Center at fixed reference points instead of sample means: 0.5 for the
# binary mom_hs and 100 for mom_iq.
kidiq$c2_mom_hs <- kidiq$mom_hs - 0.5
kidiq$c2_mom_iq <- kidiq$mom_iq - 100
fit_4c2 <- stan_glm(kid_score ~ c2_mom_hs + c2_mom_iq + c2_mom_hs:c2_mom_iq,
data=kidiq, refresh = 0)
print(fit_4c2)
stan_glm
family: gaussian [identity]
formula: kid_score ~ c2_mom_hs + c2_mom_iq + c2_mom_hs:c2_mom_iq
observations: 434
predictors: 4
------
Median MAD_SD
(Intercept) 86.8 1.2
c2_mom_hs 2.9 2.4
c2_mom_iq 0.7 0.1
c2_mom_hs:c2_mom_iq -0.5 0.2
Auxiliary parameter(s):
Median MAD_SD
sigma 18.0 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Standardize each predictor: subtract the mean and divide by two standard
# deviations, putting the binary and continuous coefficients on a
# comparable scale.
kidiq$z_mom_hs <- (kidiq$mom_hs - mean(kidiq$mom_hs))/(2*sd(kidiq$mom_hs))
kidiq$z_mom_iq <- (kidiq$mom_iq - mean(kidiq$mom_iq))/(2*sd(kidiq$mom_iq))
fit_4z <- stan_glm(kid_score ~ z_mom_hs + z_mom_iq + z_mom_hs:z_mom_iq,
data=kidiq, refresh = 0)
print(fit_4z)
stan_glm
family: gaussian [identity]
formula: kid_score ~ z_mom_hs + z_mom_iq + z_mom_hs:z_mom_iq
observations: 434
predictors: 4
------
Median MAD_SD
(Intercept) 87.6 0.9
z_mom_hs 2.4 1.9
z_mom_iq 17.6 1.9
z_mom_hs:z_mom_iq -11.8 3.9
Auxiliary parameter(s):
Median MAD_SD
sigma 18.0 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Categorical predictor: treating mom_work as a factor expands it into
# indicator variables, with category 1 as the baseline absorbed into the
# intercept.
fit_5 <- stan_glm(kid_score ~ as.factor(mom_work), data=kidiq, refresh = 0)
print(fit_5)
stan_glm
family: gaussian [identity]
formula: kid_score ~ as.factor(mom_work)
observations: 434
predictors: 4
------
Median MAD_SD
(Intercept) 81.9 2.3
as.factor(mom_work)2 4.0 3.1
as.factor(mom_work)3 11.5 3.6
as.factor(mom_work)4 5.3 2.8
Auxiliary parameter(s):
Median MAD_SD
sigma 20.3 0.7
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg