Linear regression and leave-one-out cross-validation. See Chapter 11 in Regression and Other Stories.
library("rprojroot")
# Locate the project root by the marker file, for portable data paths
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
library("loo")
library("foreign")
# for reproducibility
SEED <- 1507
# Kid IQ data: child test score plus maternal predictors
kidiq <- read.csv(root("KidIQ/data","kidiq.csv"))
head(kidiq)
kid_score mom_hs mom_iq mom_work mom_age
1 65 1 121.11753 4 27
2 98 1 89.36188 4 25
3 85 1 115.44316 4 27
4 83 1 99.44964 3 25
5 115 1 92.74571 4 27
6 98 0 107.90184 1 18
The option refresh = 0
suppresses the default Stan sampling progress output. This is useful for small data with fast computation. For more complex models and bigger data, it can be useful to see the progress.
# Baseline model: regress child test score on maternal high-school
# completion and maternal IQ
fit_3 <- stan_glm(kid_score ~ mom_hs + mom_iq, data=kidiq,
seed=SEED, refresh = 0)
print(fit_3)
stan_glm
family: gaussian [identity]
formula: kid_score ~ mom_hs + mom_iq
observations: 434
predictors: 3
------
Median MAD_SD
(Intercept) 25.8 5.9
mom_hs 5.9 2.2
mom_iq 0.6 0.1
Auxiliary parameter(s):
Median MAD_SD
sigma 18.2 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
Using the within-sample plug-in (i.e., with mean parameters) log score
# Plug-in log score: log predictive density evaluated at the point
# estimates (fitted means and posterior sigma); ignores parameter
# uncertainty, so it is optimistic
pluginlogscore_3 <- sum(dnorm(kidiq$kid_score, fitted(fit_3), sigma(fit_3), log = TRUE))
round(pluginlogscore_3, 1)
[1] -1872
Using the within-sample posterior predictive (i.e., integrating over parameters) log score
# Posterior predictive log score: for each observation, average the
# predictive density over posterior draws, then take the log — this
# integrates over parameter uncertainty (contrast with the plug-in
# score above, which conditions on point estimates).
sigmas <- as.matrix(fit_3)[,'sigma']  # posterior draws of residual sd
preds <- posterior_linpred(fit_3)     # draws x observations linear predictors
nsims <- nrow(preds)
# seq_len(nsims) instead of 1:nsims — safe even for a zero-row matrix
logscore_3 <- sum(log(rowMeans(sapply(seq_len(nsims), FUN = function(i) dnorm(kidiq$kid_score, preds[i,], sigmas[i], log=FALSE)))))
round(logscore_3, 1)
[1] -1872
# Sanity check: add five pure-noise predictors; a good model-comparison
# criterion should penalize these irrelevant regressors
set.seed(SEED)
n <- nrow(kidiq)
kidiqr <- kidiq
# Store the noise as an n x 5 matrix column; the model formula expands
# it into five predictors (noise1..noise5)
kidiqr$noise <- array(rnorm(5*n), c(n,5))
fit_3n <- stan_glm(kid_score ~ mom_hs + mom_iq + noise, data=kidiqr,
seed=SEED, refresh = 0)
print(fit_3n)
stan_glm
family: gaussian [identity]
formula: kid_score ~ mom_hs + mom_iq + noise
observations: 434
predictors: 8
------
Median MAD_SD
(Intercept) 25.4 5.8
mom_hs 6.0 2.3
mom_iq 0.6 0.1
noise1 -0.1 0.9
noise2 -1.3 0.9
noise3 0.0 0.9
noise4 0.2 0.9
noise5 -0.5 0.9
Auxiliary parameter(s):
Median MAD_SD
sigma 18.2 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
Using the within-sample plug-in (i.e., with mean parameters) log score
# Plug-in log score for the noise-augmented model
pluginlogscore_3n <- sum(dnorm(kidiq$kid_score, fitted(fit_3n), sigma(fit_3n), log = TRUE))
round(pluginlogscore_3n, 1)
[1] -1870.9
Using the within-sample posterior predictive (i.e., integrating over parameters) log score
# Posterior predictive log score for the noise-augmented model
sigmas <- as.matrix(fit_3n)[,'sigma']
preds <- posterior_linpred(fit_3n)
# Fix: recompute the number of draws from THIS fit instead of silently
# reusing nsims computed earlier from fit_3. The two fits happen to use
# the same number of draws here, but reusing the stale value would index
# out of bounds (or drop draws) if the sampler settings ever differed.
nsims <- nrow(preds)
logscore_3n <- sum(log(rowMeans(sapply(seq_len(nsims), FUN = function(i) dnorm(kidiq$kid_score, preds[i,], sigmas[i], log=FALSE)))))
round(logscore_3n, 1)
[1] -1871.1
round(pluginlogscore_3n - pluginlogscore_3, 1)
[1] 1.1
round(logscore_3n - logscore_3, 1)
[1] 0.9
# Leave-one-out cross-validation (PSIS-LOO) for the baseline model
loo_3 <- loo(fit_3)
print(loo_3)
Computed from 4000 by 434 log-likelihood matrix
Estimate SE
elpd_loo -1876.1 14.2
p_loo 4.0 0.4
looic 3752.1 28.5
------
Monte Carlo SE of elpd_loo is 0.0.
All Pareto k estimates are good (k < 0.5).
See help('pareto-k-diagnostic') for details.
# PSIS-LOO for the noise-augmented model
loo_3n <- loo(fit_3n)
print(loo_3n)
Computed from 4000 by 434 log-likelihood matrix
Estimate SE
elpd_loo -1879.8 14.2
p_loo 8.7 0.7
looic 3759.6 28.4
------
Monte Carlo SE of elpd_loo is 0.0.
All Pareto k estimates are good (k < 0.5).
See help('pareto-k-diagnostic') for details.
loo_compare(loo_3, loo_3n)
elpd_diff se_diff
fit_3 0.0 0.0
fit_3n -3.7 1.5
# Simpler model: mom_hs only, for comparison against fit_3
fit_1 <- stan_glm(kid_score ~ mom_hs, data=kidiq,
seed = SEED, refresh = 0)
loo_1 <- loo(fit_1)
print(loo_1)
Computed from 4000 by 434 log-likelihood matrix
Estimate SE
elpd_loo -1914.8 13.8
p_loo 3.1 0.3
looic 3829.6 27.6
------
Monte Carlo SE of elpd_loo is 0.0.
All Pareto k estimates are good (k < 0.5).
See help('pareto-k-diagnostic') for details.
loo_compare(loo_3, loo_1)
elpd_diff se_diff
fit_3 0.0 0.0
fit_1 -38.7 8.3
# Add the mom_hs x mom_iq interaction
fit_4 <- stan_glm(kid_score ~ mom_hs + mom_iq + mom_iq:mom_hs,
data=kidiq, seed=SEED, refresh = 0)
print(fit_4)
stan_glm
family: gaussian [identity]
formula: kid_score ~ mom_hs + mom_iq + mom_iq:mom_hs
observations: 434
predictors: 4
------
Median MAD_SD
(Intercept) -9.8 14.0
mom_hs 49.5 15.2
mom_iq 1.0 0.1
mom_hs:mom_iq -0.5 0.2
Auxiliary parameter(s):
Median MAD_SD
sigma 18.0 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# PSIS-LOO for the interaction model
loo_4 <- loo(fit_4)
print(loo_4)
Computed from 4000 by 434 log-likelihood matrix
Estimate SE
elpd_loo -1872.7 14.4
p_loo 5.0 0.5
looic 3745.3 28.8
------
Monte Carlo SE of elpd_loo is 0.0.
All Pareto k estimates are good (k < 0.5).
See help('pareto-k-diagnostic') for details.
loo_compare(loo_3, loo_4)
elpd_diff se_diff
fit_4 0.0 0.0
fit_3 -3.4 2.8
# Interaction model with maternal IQ on the log scale
fit_5 <- stan_glm(kid_score ~ mom_hs + log(mom_iq) + log(mom_iq):mom_hs,
data=kidiq, seed=SEED, refresh = 0)
print(fit_5)
stan_glm
family: gaussian [identity]
formula: kid_score ~ mom_hs + log(mom_iq) + log(mom_iq):mom_hs
observations: 434
predictors: 4
------
Median MAD_SD
(Intercept) -279.9 50.2
mom_hs 118.6 52.3
log(mom_iq) 79.3 11.1
mom_hs:log(mom_iq) -25.0 11.6
Auxiliary parameter(s):
Median MAD_SD
sigma 17.9 0.6
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# PSIS-LOO for the log-scale interaction model
loo_5 <- loo(fit_5)
print(loo_5)
Computed from 4000 by 434 log-likelihood matrix
Estimate SE
elpd_loo -1871.3 14.5
p_loo 4.5 0.4
looic 3742.6 29.0
------
Monte Carlo SE of elpd_loo is 0.0.
All Pareto k estimates are good (k < 0.5).
See help('pareto-k-diagnostic') for details.
loo_compare(loo_3, loo_5)
elpd_diff se_diff
fit_5 0.0 0.0
fit_3 -4.8 2.0
loo_compare(loo_1, loo_3, loo_4, loo_5)
elpd_diff se_diff
fit_5 0.0 0.0
fit_4 -1.4 1.3
fit_3 -4.8 2.0
fit_1 -43.5 8.9