Predicting the yields of mesquite bushes. See Chapter 12 in Regression and Other Stories.
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("foreign")
library("rstanarm")
library("loo")
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans"))
Set random seed for reproducibility
SEED <- 4587
mesquite <- read.table(root("Mesquite/data","mesquite.dat"), header=TRUE)
head(mesquite)
obs group diam1 diam2 total_height canopy_height density weight
1 1 MCD 1.8 1.15 1.30 1.00 1 401.3
2 2 MCD 1.7 1.35 1.35 1.33 1 513.7
3 3 MCD 2.8 2.55 2.16 0.60 1 1179.2
4 4 MCD 1.3 0.85 1.80 1.20 1 308.0
5 5 MCD 3.3 1.90 1.55 1.05 1 855.2
6 6 MCD 1.4 1.40 1.20 1.00 1 268.7
fit_1 <- stan_glm(weight ~ diam1 + diam2 + canopy_height + total_height +
density + group, data=mesquite, seed=SEED, refresh=0)
print(fit_1)
stan_glm
family: gaussian [identity]
formula: weight ~ diam1 + diam2 + canopy_height + total_height + density +
group
observations: 46
predictors: 7
------
Median MAD_SD
(Intercept) -1087.1 177.1
diam1 186.9 114.4
diam2 370.4 124.6
canopy_height 349.3 211.1
total_height -99.9 188.7
density 131.4 34.7
groupMCD 363.5 99.6
Auxiliary parameter(s):
Median MAD_SD
sigma 273.9 30.8
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
(loo_1 <- loo(fit_1))
Computed from 4000 by 46 log-likelihood matrix
Estimate SE
elpd_loo -334.1 12.9
p_loo 15.9 8.8
looic 668.3 25.8
------
Monte Carlo SE of elpd_loo is NA.
Pareto k diagnostic values:
Count Pct. Min. n_eff
(-Inf, 0.5] (good) 43 93.5% 698
(0.5, 0.7] (ok) 1 2.2% 102
(0.7, 1] (bad) 1 2.2% 113
(1, Inf) (very bad) 1 2.2% 2
See help('pareto-k-diagnostic') for details.
We get warnings about high Pareto k values, which indicate that the importance sampling approximation used in loo is unreliable in this case. We therefore use the more robust K-fold cross-validation.
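The problematic observations can also be identified directly from the loo object; a small sketch using loo's pareto_k_ids (the threshold here is chosen just for illustration):
pareto_k_ids(loo_1, threshold=0.7)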
kfold_1 <- kfold(fit_1, K=10)
kfold_1
Based on 10-fold cross-validation
Estimate SE
elpd_kfold -347.4 26.9
p_kfold NA NA
kfoldic 694.8 53.9
fit_2 <- stan_glm(log(weight) ~ log(diam1) + log(diam2) + log(canopy_height) +
log(total_height) + log(density) + group,
data=mesquite, seed=SEED, refresh=0)
(loo_2 <- loo(fit_2))
Computed from 4000 by 46 log-likelihood matrix
Estimate SE
elpd_loo -19.2 5.3
p_loo 7.5 1.5
looic 38.5 10.7
------
Monte Carlo SE of elpd_loo is NA.
Pareto k diagnostic values:
Count Pct. Min. n_eff
(-Inf, 0.5] (good) 44 95.7% 945
(0.5, 0.7] (ok) 1 2.2% 374
(0.7, 1] (bad) 1 2.2% 555
(1, Inf) (very bad) 0 0.0% <NA>
See help('pareto-k-diagnostic') for details.
A Jacobian correction is needed because model 1 models y while model 2 models log(y): under the log transformation the density picks up a factor 1/y, so log(weight) is subtracted from each pointwise log predictive density.
loo_2_with_jacobian <- loo_2
loo_2_with_jacobian$pointwise[,1] <- loo_2_with_jacobian$pointwise[,1]-
log(mesquite$weight)
(elpd_loo_2_with_jacobian <- sum(loo_2_with_jacobian$pointwise[,1]))
[1] -291.5386
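As a quick sanity check of the correction (a sketch): since each pointwise value is shifted by -log(weight), the corrected total should equal elpd_loo minus sum(log(weight)).
loo_2$estimates["elpd_loo","Estimate"] - sum(log(mesquite$weight))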
There will be a warning that the target data are not the same; this is fine because we have applied the Jacobian correction.
loo_compare(kfold_1, loo_2_with_jacobian)
elpd_diff se_diff
fit_2 0.0 0.0
fit_1 -55.9 24.6
yrep_1 <- posterior_predict(fit_1)
n_sims <- nrow(yrep_1)
sims_display <- sample(n_sims, 100)
ppc_1 <- ppc_dens_overlay(mesquite$weight, yrep_1[sims_display,]) +
theme(axis.line.y = element_blank())
yrep_2 <- posterior_predict(fit_2)
ppc_2 <- ppc_dens_overlay(log(mesquite$weight), yrep_2[sims_display,]) +
theme(axis.line.y = element_blank())
bpg <- bayesplot_grid(
ppc_1, ppc_2,
grid_args = list(ncol = 2),
titles = c("Model for weight", "Model for log(weight)")
)
bpg
mcmc_areas(as.matrix(fit_2), regex_pars = "^log|^gro")
Plot the joint marginal posterior of the coefficients of log(canopy_height) and log(total_height).
mcmc_scatter(as.matrix(fit_2), pars = c("log(canopy_height)","log(total_height)"), size = 1) +
geom_vline(xintercept=0) +
geom_hline(yintercept=0) +
labs(x="coef of log(canopy_height)", y="coef of log(total_height)")
Construct more interpretable derived predictors: canopy volume, canopy area, and canopy shape.
mesquite$canopy_volume <- mesquite$diam1 * mesquite$diam2 * mesquite$canopy_height
mesquite$canopy_area <- mesquite$diam1 * mesquite$diam2
mesquite$canopy_shape <- mesquite$diam1 / mesquite$diam2
fit_3 <- stan_glm(log(weight) ~ log(canopy_volume), data=mesquite,
seed=SEED, refresh=0)
print(fit_3)
stan_glm
family: gaussian [identity]
formula: log(weight) ~ log(canopy_volume)
observations: 46
predictors: 2
------
Median MAD_SD
(Intercept) 5.2 0.1
log(canopy_volume) 0.7 0.1
Auxiliary parameter(s):
Median MAD_SD
sigma 0.4 0.0
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
loo_3 <- loo(fit_3)
Both models model log(y), so they can be compared directly.
loo_compare(loo_2, loo_3)
elpd_diff se_diff
fit_2 0.0 0.0
fit_3 -7.4 5.0
round(median(loo_R2(fit_2)),2)
[1] 0.85
round(median(loo_R2(fit_3)),2)
[1] 0.79
round(median(bayes_R2(fit_2)),2)
[1] 0.87
round(median(bayes_R2(fit_3)),2)
[1] 0.79
fit_4 <- stan_glm(log(weight) ~ log(canopy_volume) +
log(canopy_area) + log(canopy_shape) +
log(total_height) + log(density) + group,
data=mesquite, seed=SEED, refresh=0)
print(fit_4)
stan_glm
family: gaussian [identity]
formula: log(weight) ~ log(canopy_volume) + log(canopy_area) + log(canopy_shape) +
log(total_height) + log(density) + group
observations: 46
predictors: 7
------
Median MAD_SD
(Intercept) 4.8 0.2
log(canopy_volume) 0.4 0.3
log(canopy_area) 0.4 0.3
log(canopy_shape) -0.4 0.2
log(total_height) 0.4 0.3
log(density) 0.1 0.1
groupMCD 0.6 0.1
Auxiliary parameter(s):
Median MAD_SD
sigma 0.3 0.0
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
(loo_4 <- loo(fit_4))
Computed from 4000 by 46 log-likelihood matrix
Estimate SE
elpd_loo -19.3 5.3
p_loo 7.5 1.5
looic 38.6 10.7
------
Monte Carlo SE of elpd_loo is 0.1.
Pareto k diagnostic values:
Count Pct. Min. n_eff
(-Inf, 0.5] (good) 44 95.7% 925
(0.5, 0.7] (ok) 2 4.3% 268
(0.7, 1] (bad) 0 0.0% <NA>
(1, Inf) (very bad) 0 0.0% <NA>
All Pareto k estimates are ok (k < 0.7).
See help('pareto-k-diagnostic') for details.
loo_compare(loo_2, loo_4)
elpd_diff se_diff
fit_2 0.0 0.0
fit_4 -0.1 0.1
round(median(loo_R2(fit_4)),2)
[1] 0.85
round(median(bayes_R2(fit_4)),2)
[1] 0.87
mcmc_hist(data.frame(bayes_R2(fit_4)), binwidth=0.005)+
xlab('Bayesian R^2') + scale_y_continuous(breaks=NULL)
mcmc_areas(as.matrix(fit_4))
The strong collinearity between canopy volume and canopy area is obvious in the pairs plot.
mcmc_pairs(as.matrix(fit_4), pars=c("log(canopy_volume)","log(canopy_area)",
"log(canopy_shape)","log(total_height)",
"log(density)"))
Given the collinearity, fit a simpler model with just canopy volume, canopy shape, and group.
fit_5 <- stan_glm(log(weight) ~ log(canopy_volume) + log(canopy_shape) +
group, data=mesquite, seed=SEED, refresh=0)
(loo_5 <- loo(fit_5))
Computed from 4000 by 46 log-likelihood matrix
Estimate SE
elpd_loo -18.0 5.4
p_loo 5.4 1.3
looic 36.0 10.9
------
Monte Carlo SE of elpd_loo is 0.1.
Pareto k diagnostic values:
Count Pct. Min. n_eff
(-Inf, 0.5] (good) 44 95.7% 1195
(0.5, 0.7] (ok) 2 4.3% 530
(0.7, 1] (bad) 0 0.0% <NA>
(1, Inf) (very bad) 0 0.0% <NA>
All Pareto k estimates are ok (k < 0.7).
See help('pareto-k-diagnostic') for details.
loo_compare(loo_4, loo_5)
elpd_diff se_diff
fit_5 0.0 0.0
fit_4 -1.3 1.5
round(median(loo_R2(fit_5)),2)
[1] 0.85
round(median(bayes_R2(fit_5)),2)
[1] 0.87
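As a final check, the posterior predictive distribution of the chosen model can be compared to the data in the same way as above; a sketch reusing the earlier sims_display sample:
yrep_5 <- posterior_predict(fit_5)
ppc_dens_overlay(log(mesquite$weight), yrep_5[sims_display,]) +
  theme(axis.line.y = element_blank())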