Linear regression with a single predictor. See Chapters 6 and 7 in Regression and Other Stories.
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
# Simulate fake data from the linear model y = a + b*x + epsilon,
# with epsilon ~ normal(0, sigma)
x <- 1:20
n <- length(x)
a <- 0.2
b <- 0.3
sigma <- 0.5
# set the random seed to get reproducible results
# change the seed to experiment with variation due to random noise
set.seed(2141)
# rnorm(n, 0, sigma) draws the same stream as sigma*rnorm(n)
y <- a + b*x + rnorm(n, 0, sigma)
fake <- data.frame(x = x, y = y)
The option refresh = 0 suppresses the default Stan sampling progress output. This is useful for small data with fast computation. For more complex models and bigger data, it can be useful to see the progress.
# Fit the linear regression y ~ x using rstanarm's default
# weakly informative priors; seed fixed for reproducible MCMC
fit_1 <- stan_glm(formula = y ~ x, data = fake, refresh = 0, seed = 2141)
print(fit_1, digits = 2)
stan_glm
family: gaussian [identity]
formula: y ~ x
observations: 20
predictors: 2
------
Median MAD_SD
(Intercept) 0.40 0.23
x 0.28 0.02
Auxiliary parameter(s):
Median MAD_SD
sigma 0.49 0.08
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Plot the simulated data together with the fitted regression line
par(mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
plot(fake$x, fake$y, main="Data and fitted regression line", bty="l", pch=20,
     xlab = "x", ylab = "y")
# Posterior median coefficients from the fitted model
coefs <- coef(fit_1)
a_hat <- coefs[[1]]
b_hat <- coefs[[2]]
abline(a_hat, b_hat)
# Label the line at the mean of x, offset to the right
x_bar <- mean(fake$x)
label <- paste(" y =", round(a_hat, 2), "+", round(b_hat, 2), "* x")
text(x_bar, a_hat + b_hat*x_bar, label, adj=0)
# Simulate n_0 draws from a normal distribution with mean 2 and sd 5
n_0 <- 200
# set the random seed to get reproducible results
# change the seed to experiment with variation due to random noise
set.seed(2141)
y_0 <- rnorm(n_0, mean = 2.0, sd = 5.0)
fake_0 <- data.frame(y_0 = y_0)
round(y_0, 1)
[1] 9.1 2.1 0.3 3.3 2.2 12.7 -2.3 1.8 -1.0 4.1 -5.1 7.0 -4.5 -0.8 0.4
[16] 4.0 -1.8 10.2 -0.9 -0.7 4.1 2.6 -1.2 6.7 4.9 1.2 6.4 8.3 -4.8 12.5
[31] 12.6 3.1 -1.5 8.6 12.2 6.3 -2.2 3.3 3.9 -5.9 6.3 -4.0 4.0 7.3 -1.3
[46] -3.3 8.4 7.5 -4.6 -0.7 -0.9 2.4 5.8 6.1 2.6 2.2 3.1 -2.1 3.8 1.6
[61] -4.1 5.6 9.8 0.3 2.0 -3.6 -2.1 -5.4 2.8 -8.0 1.5 4.4 1.8 13.6 5.1
[76] 5.4 2.3 -1.9 -0.1 -5.6 8.1 -1.0 -8.3 7.1 6.6 5.9 4.9 5.6 -3.2 3.8
[91] 2.5 2.3 -2.9 7.0 1.2 5.0 10.6 1.3 11.8 1.1 2.7 7.9 12.1 -1.7 -1.2
[106] 10.3 -6.3 -0.3 -0.2 0.9 -2.2 -3.3 2.6 -1.1 -0.7 0.9 4.0 2.9 6.1 0.9
[121] -3.2 3.0 4.2 0.9 4.6 -3.3 1.3 -6.3 14.6 7.0 -4.9 -5.6 -4.5 -2.1 3.5
[136] 5.9 2.8 -4.3 3.0 2.8 2.7 5.2 -1.7 12.5 -2.4 0.2 6.2 12.1 -6.8 4.2
[151] 0.6 2.2 5.0 8.3 5.3 -3.8 6.4 -4.1 -0.8 -9.9 8.2 -0.9 8.1 13.5 5.8
[166] -0.8 -2.7 7.0 -0.6 -1.3 5.3 -3.2 9.4 6.8 6.7 6.3 2.9 -5.3 -3.1 8.7
[181] -1.5 -0.2 -3.1 -2.3 8.0 0.7 -2.6 0.2 -0.2 1.3 4.5 13.4 0.1 12.7 3.5
[196] 6.0 -0.9 3.3 4.9 14.4
# Sample mean and standard error of the mean, to compare with the
# intercept-only regression estimate below
round(mean(y_0), 2)
[1] 2.43
round(sd(y_0)/sqrt(n_0), 2)
[1] 0.36
# Intercept-only regression with flat priors (all priors set to NULL),
# so the posterior median matches the sample mean
fit_2 <- stan_glm(formula = y_0 ~ 1, data = fake_0, refresh = 0, seed = 2141,
                  prior = NULL, prior_intercept = NULL, prior_aux = NULL)
print(fit_2)
stan_glm
family: gaussian [identity]
formula: y_0 ~ 1
observations: 200
predictors: 1
------
Median MAD_SD
(Intercept) 2.4 0.4
Auxiliary parameter(s):
Median MAD_SD
sigma 5.1 0.3
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Simulate a second group and compute the classical difference-in-means
# estimate with its standard error
n_1 <- 300
# set the random seed to get reproducible results
# change the seed to experiment with variation due to random noise
set.seed(2141)
y_1 <- rnorm(n_1, 8, 5)
# Standard errors of each group mean
se_0 <- sd(y_0)/sqrt(n_0)
se_1 <- sd(y_1)/sqrt(n_1)
# Difference in means and its standard error (independent groups)
diff <- mean(y_1) - mean(y_0)
se <- sqrt(se_0^2 + se_1^2)
# Difference in means and its standard error -- compare with the slope
# estimate from the indicator regression below
print(diff)
[1] 6.076615
print(se)
[1] 0.460388
# Stack both groups and code group membership as an indicator x:
# 0 for the first group, 1 for the second
n <- n_0 + n_1
y <- c(y_0, y_1)
x <- rep(c(0, 1), c(n_0, n_1))
fake <- data.frame(y = y, x = x)
# With flat priors, the slope on the indicator equals the
# difference in means and the intercept equals mean(y_0)
fit_3 <- stan_glm(formula = y ~ x, data = fake, refresh = 0, seed = 2141,
                  prior = NULL, prior_intercept = NULL, prior_aux = NULL)
print(fit_3)
stan_glm
family: gaussian [identity]
formula: y ~ x
observations: 500
predictors: 2
------
Median MAD_SD
(Intercept) 2.4 0.4
x 6.1 0.5
Auxiliary parameter(s):
Median MAD_SD
sigma 5.0 0.2
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# Plot both groups against the indicator with the fitted line and the
# two group means as dashed reference lines
par(mar=c(3,3,3,2), mgp=c(1.7,.5,0), tck=-.01)
plot(x, y, xlab="Indicator, x", ylab="y", bty="l", xaxt="n", main="Regression on an indicator is the same\nas computing a difference in means", pch=19, cex=.5)
axis(1, c(0, 1))
abline(h=mean(y[x==0]), lty=2, col="gray50")
abline(h=mean(y[x==1]), lty=2, col="gray50")
abline(coef(fit_3)[1], coef(fit_3)[2])
text(.5, -1 + coef(fit_3)[1] + .5*coef(fit_3)[2], paste("y =", round(coef(fit_3)[1], 2), "+", round(coef(fit_3)[2], 2), "x"), cex=.9, adj=0)
# BUG FIX: the group-mean labels were hard-coded as 2.68 and 8.38, which do
# not match this simulation (mean(y_0) = 2.43 with seed 2141, and the fitted
# intercept/slope imply means of about 2.4 and 8.5). Compute the labels from
# the data instead, splicing the rounded values into plotmath via bquote().
text(.05, -1 + mean(y[x==0]), bquote(bar(y)[0] == .(round(mean(y[x==0]), 2))), col="gray30", cex=.9, adj=0)
text(.95, 1 + mean(y[x==1]), bquote(bar(y)[1] == .(round(mean(y[x==1]), 2))), col="gray30", cex=.9, adj=1)
# NOTE(review): this call is identical to fit_3 above (same formula, data,
# seed, and NULL priors, only the argument order differs), and the printed
# results match fit_3's -- presumably kept as a deliberate repeat; confirm
# intent, otherwise it can be removed.
fit_3b <- stan_glm(y ~ x, data = fake, seed=2141, refresh = 0,
prior=NULL, prior_intercept=NULL, prior_aux=NULL)
print(fit_3b)
stan_glm
family: gaussian [identity]
formula: y ~ x
observations: 500
predictors: 2
------
Median MAD_SD
(Intercept) 2.4 0.4
x 6.1 0.5
Auxiliary parameter(s):
Median MAD_SD
sigma 5.0 0.2
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg