Predicting the yields of mesquite bushes. See Chapter 12 in Regression and Other Stories.


Load packages

library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("foreign")
library("rstanarm")
library("loo")
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans"))

Set random seed for reproducibility

SEED <- 4587

Load data

mesquite <- read.table(root("Mesquite/data","mesquite.dat"), header=TRUE)
head(mesquite)
  obs group diam1 diam2 total_height canopy_height density weight
1   1   MCD   1.8  1.15         1.30          1.00       1  401.3
2   2   MCD   1.7  1.35         1.35          1.33       1  513.7
3   3   MCD   2.8  2.55         2.16          0.60       1 1179.2
4   4   MCD   1.3  0.85         1.80          1.20       1  308.0
5   5   MCD   3.3  1.90         1.55          1.05       1  855.2
6   6   MCD   1.4  1.40         1.20          1.00       1  268.7

Predict weight given all the predictors

fit_1 <- stan_glm(weight ~ diam1 + diam2 + canopy_height + total_height +
                    density + group, data=mesquite, seed=SEED, refresh=0)
print(fit_1)
stan_glm
 family:       gaussian [identity]
 formula:      weight ~ diam1 + diam2 + canopy_height + total_height + density + 
       group
 observations: 46
 predictors:   7
------
              Median  MAD_SD 
(Intercept)   -1087.1   177.1
diam1           186.9   114.4
diam2           370.4   124.6
canopy_height   349.3   211.1
total_height    -99.9   188.7
density         131.4    34.7
groupMCD        363.5    99.6

Auxiliary parameter(s):
      Median MAD_SD
sigma 273.9   30.8 

------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
(loo_1 <- loo(fit_1))

Computed from 4000 by 46 log-likelihood matrix

         Estimate   SE
elpd_loo   -334.1 12.9
p_loo        15.9  8.8
looic       668.3 25.8
------
Monte Carlo SE of elpd_loo is NA.

Pareto k diagnostic values:
                         Count Pct.    Min. n_eff
(-Inf, 0.5]   (good)     43    93.5%   698       
 (0.5, 0.7]   (ok)        1     2.2%   102       
   (0.7, 1]   (bad)       1     2.2%   113       
   (1, Inf)   (very bad)  1     2.2%   2         
See help('pareto-k-diagnostic') for details.

We get warnings about high Pareto k values, which indicates that the importance sampling approximation used in loo is in this case unreliable. We thus use more robust K-fold-CV.

kfold_1 <- kfold(fit_1, K=10)
kfold_1

Based on 10-fold cross-validation

           Estimate   SE
elpd_kfold   -347.4 26.9
p_kfold          NA   NA
kfoldic       694.8 53.9

Predict log(weight) given log transformed predictors

fit_2 <- stan_glm(log(weight) ~ log(diam1) + log(diam2) + log(canopy_height) +
                      log(total_height) + log(density) + group,
                  data=mesquite, seed=SEED, refresh=0)
(loo_2 <- loo(fit_2))

Computed from 4000 by 46 log-likelihood matrix

         Estimate   SE
elpd_loo    -19.2  5.3
p_loo         7.5  1.5
looic        38.5 10.7
------
Monte Carlo SE of elpd_loo is NA.

Pareto k diagnostic values:
                         Count Pct.    Min. n_eff
(-Inf, 0.5]   (good)     44    95.7%   945       
 (0.5, 0.7]   (ok)        1     2.2%   374       
   (0.7, 1]   (bad)       1     2.2%   555       
   (1, Inf)   (very bad)  0     0.0%   <NA>      
See help('pareto-k-diagnostic') for details.

Jacobian correction to make the models comparable

Jacobian correction is needed as model 1 is modeling y and model 2 is modeling log(y).

loo_2_with_jacobian <- loo_2
loo_2_with_jacobian$pointwise[,1] <- loo_2_with_jacobian$pointwise[,1]-
                                     log(mesquite$weight)
(elpd_loo_2_with_jacobian <- sum(loo_2_with_jacobian$pointwise[,1]))
[1] -291.5386

There will be a warning that the target data is not the same; this is OK because we have the Jacobian correction.

loo_compare(kfold_1, loo_2_with_jacobian)
      elpd_diff se_diff
fit_2   0.0       0.0  
fit_1 -55.9      24.6  

Posterior predictive checking for model in original scale

yrep_1 <- posterior_predict(fit_1)
n_sims <- nrow(yrep_1)
sims_display <- sample(n_sims, 100)
ppc_1 <- ppc_dens_overlay(mesquite$weight, yrep_1[sims_display,]) +
    theme(axis.line.y = element_blank())

Posterior predictive checking for model in log scale

yrep_2 <- posterior_predict(fit_2)
ppc_2 <- ppc_dens_overlay(log(mesquite$weight), yrep_2[sims_display,]) +
  theme(axis.line.y = element_blank())
bpg <- bayesplot_grid(
  ppc_1, ppc_2,
  grid_args = list(ncol = 2),
  titles = c("Model for weight", "Model for log(weight)")
)

Display posterior predictive checking plots

bpg

Plot marginal posteriors

mcmc_areas(as.matrix(fit_2), regex_pars = "^log|^gro")

Plot joint marginal posterior for log(canopy_height) and log(total_height)

mcmc_scatter(as.matrix(fit_2), pars = c("log(canopy_height)","log(total_height)"), size = 1) +
    geom_vline(xintercept=0) +
    geom_hline(yintercept=0) +
    labs(x="coef of log(canopy_height)", y="coef of log(total_height)")

Additional transformed variables

mesquite$canopy_volume <- mesquite$diam1 * mesquite$diam2 * mesquite$canopy_height
mesquite$canopy_area <- mesquite$diam1 * mesquite$diam2
mesquite$canopy_shape <- mesquite$diam1 / mesquite$diam2

A model with a canopy volume variable

fit_3 <- stan_glm(log(weight) ~ log(canopy_volume), data=mesquite,
                  seed=SEED, refresh=0)
print(fit_3)
stan_glm
 family:       gaussian [identity]
 formula:      log(weight) ~ log(canopy_volume)
 observations: 46
 predictors:   2
------
                   Median MAD_SD
(Intercept)        5.2    0.1   
log(canopy_volume) 0.7    0.1   

Auxiliary parameter(s):
      Median MAD_SD
sigma 0.4    0.0   

------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
loo_3 <- loo(fit_3)

Both models are modeling log(y) and can be compared directly.

loo_compare(loo_2, loo_3)
      elpd_diff se_diff
fit_2  0.0       0.0   
fit_3 -7.4       5.0   

Compare also LOO-R^2

round(median(loo_R2(fit_2)),2)
[1] 0.85
round(median(loo_R2(fit_3)),2)
[1] 0.79

Compare Bayesian R^2

round(median(bayes_R2(fit_2)),2)
[1] 0.87
round(median(bayes_R2(fit_3)),2)
[1] 0.79

Add canopy area and canopy shape

fit_4 <- stan_glm(log(weight) ~ log(canopy_volume) +
                      log(canopy_area) + log(canopy_shape) +
                      log(total_height) + log(density) + group,
                  data=mesquite, seed=SEED, refresh=0)
print(fit_4)
stan_glm
 family:       gaussian [identity]
 formula:      log(weight) ~ log(canopy_volume) + log(canopy_area) + log(canopy_shape) + 
       log(total_height) + log(density) + group
 observations: 46
 predictors:   7
------
                   Median MAD_SD
(Intercept)         4.8    0.2  
log(canopy_volume)  0.4    0.3  
log(canopy_area)    0.4    0.3  
log(canopy_shape)  -0.4    0.2  
log(total_height)   0.4    0.3  
log(density)        0.1    0.1  
groupMCD            0.6    0.1  

Auxiliary parameter(s):
      Median MAD_SD
sigma 0.3    0.0   

------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
(loo_4 <- loo(fit_4))

Computed from 4000 by 46 log-likelihood matrix

         Estimate   SE
elpd_loo    -19.3  5.3
p_loo         7.5  1.5
looic        38.6 10.7
------
Monte Carlo SE of elpd_loo is 0.1.

Pareto k diagnostic values:
                         Count Pct.    Min. n_eff
(-Inf, 0.5]   (good)     44    95.7%   925       
 (0.5, 0.7]   (ok)        2     4.3%   268       
   (0.7, 1]   (bad)       0     0.0%   <NA>      
   (1, Inf)   (very bad)  0     0.0%   <NA>      

All Pareto k estimates are ok (k < 0.7).
See help('pareto-k-diagnostic') for details.
loo_compare(loo_2, loo_4)
      elpd_diff se_diff
fit_2  0.0       0.0   
fit_4 -0.1       0.1   
round(median(loo_R2(fit_4)),2)
[1] 0.85
round(median(bayes_R2(fit_4)),2)
[1] 0.87

Plot Bayesian R^2

mcmc_hist(data.frame(bayes_R2(fit_4)), binwidth=0.005)+
  xlab('Bayesian R^2') + scale_y_continuous(breaks=NULL)

Plot marginals

mcmc_areas(as.matrix(fit_4))

Plot pairwise joint marginals

Strong collinearity between canopy volume and canopy area is obvious

mcmc_pairs(as.matrix(fit_4), pars=c("log(canopy_volume)","log(canopy_area)",
                                    "log(canopy_shape)","log(total_height)",
                                    "log(density)"))

A model with just canopy volume and canopy shape

fit_5 <- stan_glm(log(weight) ~ log(canopy_volume) + log(canopy_shape) +
    group, data=mesquite, seed=SEED, refresh=0)
(loo_5 <- loo(fit_5))

Computed from 4000 by 46 log-likelihood matrix

         Estimate   SE
elpd_loo    -18.0  5.4
p_loo         5.4  1.3
looic        36.0 10.9
------
Monte Carlo SE of elpd_loo is 0.1.

Pareto k diagnostic values:
                         Count Pct.    Min. n_eff
(-Inf, 0.5]   (good)     44    95.7%   1195      
 (0.5, 0.7]   (ok)        2     4.3%   530       
   (0.7, 1]   (bad)       0     0.0%   <NA>      
   (1, Inf)   (very bad)  0     0.0%   <NA>      

All Pareto k estimates are ok (k < 0.7).
See help('pareto-k-diagnostic') for details.
loo_compare(loo_4, loo_5)
      elpd_diff se_diff
fit_5  0.0       0.0   
fit_4 -1.3       1.5   
round(median(loo_R2(fit_5)),2)
[1] 0.85
round(median(bayes_R2(fit_5)),2)
[1] 0.87
---
title: "Regression and Other Stories: Mesquite"
author: "Andrew Gelman, Jennifer Hill, Aki Vehtari"
date: "`r format(Sys.Date())`"
output:
  html_document:
    theme: readable
    toc: true
    toc_depth: 2
    toc_float: true
    code_download: true
---
Predicting the yields of mesquite bushes. See Chapter 12 in
Regression and Other Stories.

-------------


```{r setup, include=FALSE}
# Knitr defaults: hide messages/warnings and print output without the "##"
# prefix (comment=NA), matching the book's plain-output style.
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA)
# switch this to TRUE to save figures in separate files
savefigs <- FALSE
```

#### Load packages

```{r }
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("foreign")
library("rstanarm")
library("loo")
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans"))
```
```{r eval=FALSE, include=FALSE}
# grayscale figures for the book
if (savefigs) color_scheme_set(scheme = "gray")
```

Set random seed for reproducibility

```{r }
SEED <- 4587
```

#### Load data

```{r }
# Read the mesquite measurements (46 bushes; diameters, heights, density,
# group, and the outcome weight) from a whitespace-delimited file.
mesquite <- read.table(root("Mesquite/data","mesquite.dat"), header=TRUE)
head(mesquite)
```

## Predict weight given all the predictors

```{r }
# Linear regression of weight on all raw predictors (Gaussian likelihood,
# rstanarm default priors); refresh=0 suppresses sampler progress output.
fit_1 <- stan_glm(weight ~ diam1 + diam2 + canopy_height + total_height +
                    density + group, data=mesquite, seed=SEED, refresh=0)
print(fit_1)
# PSIS-LOO estimate of out-of-sample predictive performance.
(loo_1 <- loo(fit_1))
```

We get warnings about high Pareto k values, which indicates that
the importance sampling approximation used in loo is in this case
unreliable. We thus use more robust K-fold-CV.

```{r }
# PSIS-LOO was unreliable for fit_1 (high Pareto k values), so fall back to
# the more robust 10-fold cross-validation.
kfold_1 <- kfold(fit_1, K=10)
kfold_1
```

## Predict log(weight) given log transformed predictors

```{r }
# Same predictors but with outcome and positive predictors log-transformed,
# so multiplicative relationships become additive.
fit_2 <- stan_glm(log(weight) ~ log(diam1) + log(diam2) + log(canopy_height) +
                      log(total_height) + log(density) + group,
                  data=mesquite, seed=SEED, refresh=0)
(loo_2 <- loo(fit_2))
```

#### Jacobian correction to make the models comparable<br>
Jacobian correction is needed as model 1 is modeling y and model 2
is modeling log(y).

```{r }
# fit_1 models y = weight while fit_2 models log(weight), so their elpd
# values are on different scales. Subtracting log(weight) from each pointwise
# log predictive density applies the Jacobian of the log transform, putting
# fit_2's elpd on the original weight scale so the models can be compared.
loo_2_with_jacobian <- loo_2
loo_2_with_jacobian$pointwise[,1] <- loo_2_with_jacobian$pointwise[,1]-
                                     log(mesquite$weight)
(elpd_loo_2_with_jacobian <- sum(loo_2_with_jacobian$pointwise[,1]))
```

There will be a warning that the target data is not the same; this is OK
because we have the Jacobian correction.

```{r }
loo_compare(kfold_1, loo_2_with_jacobian)
```

#### Posterior predictive checking for model in original scale

```{r }
# Fix the RNG state so the subset of posterior draws shown in the density
# overlay is reproducible: the stan_glm calls take seed=SEED, but sample()
# below draws from the global RNG, which was previously unseeded.
set.seed(SEED)
# Posterior predictive replicates of weight from the original-scale model.
yrep_1 <- posterior_predict(fit_1)
n_sims <- nrow(yrep_1)
# Overlay only 100 of the draws to keep the plot readable; sims_display is
# reused by the log-scale check so both panels show the same draws.
sims_display <- sample(n_sims, 100)
ppc_1 <- ppc_dens_overlay(mesquite$weight, yrep_1[sims_display,]) +
    theme(axis.line.y = element_blank())
```

#### Posterior predictive checking for model in log scale

```{r }
# Posterior predictive replicates from the log-scale model, compared against
# log(weight); reuse the same draw subset (sims_display) as for fit_1.
yrep_2 <- posterior_predict(fit_2)
ppc_2 <- ppc_dens_overlay(log(mesquite$weight), yrep_2[sims_display,]) +
  theme(axis.line.y = element_blank())
# Arrange both predictive checks side by side with panel titles.
bpg <- bayesplot_grid(
  ppc_1, ppc_2,
  grid_args = list(ncol = 2),
  titles = c("Model for weight", "Model for log(weight)")
)
```

#### Display posterior predictive checking plots

```{r }
bpg
```
```{r eval=FALSE, include=FALSE}
if (savefigs)
  ggsave(root("Mesquite/figs","mesquite_ppc.pdf"), bpg, height=3, width=9, colormodel="gray")
```

#### Plot marginal posteriors

```{r fig.height=3, fig.width=6}
mcmc_areas(as.matrix(fit_2), regex_pars = "^log|^gro")
```
```{r eval=FALSE, include=FALSE}
if (savefigs)
    ggsave(root("Mesquite/figs","mesquite_areas.pdf"), height=3.5, width=5, colormodel="gray")
```

#### Plot joint marginal posterior for log(canopy_height) and log(total_height)

```{r }
# Joint posterior of the two height coefficients; the reference lines at zero
# make it easy to see which sign combinations are supported.
mcmc_scatter(as.matrix(fit_2), pars = c("log(canopy_height)","log(total_height)"), size = 1) +
    geom_vline(xintercept=0) +
    geom_hline(yintercept=0) +
    labs(x="coef of log(canopy_height)", y="coef of log(total_height)")
```
```{r eval=FALSE, include=FALSE}
if (savefigs)
    ggsave(root("Mesquite/figs","mesquite_scatter.pdf"), height=3.5, width=5, colormodel="gray")
```

## Additional transformed variables

```{r }
# Derived predictors: canopy volume and area from the two diameters and the
# canopy height, and a shape measure given by the ratio of the diameters.
mesquite <- transform(mesquite,
                      canopy_volume = diam1 * diam2 * canopy_height,
                      canopy_area   = diam1 * diam2,
                      canopy_shape  = diam1 / diam2)
```

## A model with a canopy volume variable

```{r }
# Simpler model: log(weight) explained by log canopy volume alone.
fit_3 <- stan_glm(log(weight) ~ log(canopy_volume), data=mesquite,
                  seed=SEED, refresh=0)
print(fit_3)
loo_3 <- loo(fit_3)
```

Both models are modeling log(y) and can be compared directly.

```{r }
loo_compare(loo_2, loo_3)
```

#### Compare also LOO-R^2

```{r }
# LOO-R^2: cross-validated proportion of variance explained, for each model.
round(median(loo_R2(fit_2)),2)
round(median(loo_R2(fit_3)),2)
```

#### Compare Bayesian R^2

```{r }
# In-sample Bayesian R^2 for the same two models.
round(median(bayes_R2(fit_2)),2)
round(median(bayes_R2(fit_3)),2)
```

## Add canopy area and canopy shape

```{r }
# Extend the volume model with canopy area and shape plus the remaining
# predictors. Volume, area, and shape are products/ratios of the raw
# measurements, so on the log scale this spans the same space as fit_2.
fit_4 <- stan_glm(log(weight) ~ log(canopy_volume) +
                      log(canopy_area) + log(canopy_shape) +
                      log(total_height) + log(density) + group,
                  data=mesquite, seed=SEED, refresh=0)
print(fit_4)
(loo_4 <- loo(fit_4))
# Both models are on the log(weight) scale, so the loo objects compare directly.
loo_compare(loo_2, loo_4)
round(median(loo_R2(fit_4)),2)
round(median(bayes_R2(fit_4)),2)
```

#### Plot Bayesian R^2

```{r results='hide', fig.height=3, fig.width=6}
# Posterior distribution of Bayesian R^2 for fit_4.
mcmc_hist(data.frame(bayes_R2(fit_4)), binwidth=0.005)+
  xlab('Bayesian R^2') + scale_y_continuous(breaks=NULL)
```

#### Plot marginals

```{r fig.height=3, fig.width=6}
mcmc_areas(as.matrix(fit_4))
```

#### Plot pairwise joint marginals<br>
Strong collinearity between canopy volume and canopy area is obvious

```{r fig.width=8, fig.height=8}
# Pairwise joint posteriors of the transformed-predictor coefficients;
# collinearity shows up as strong correlation between volume and area.
mcmc_pairs(as.matrix(fit_4), pars=c("log(canopy_volume)","log(canopy_area)",
                                    "log(canopy_shape)","log(total_height)",
                                    "log(density)"))
```

## A model with just canopy volume and canopy shape

```{r }
# Reduced model keeping only canopy volume, canopy shape, and group.
fit_5 <- stan_glm(log(weight) ~ log(canopy_volume) + log(canopy_shape) +
    group, data=mesquite, seed=SEED, refresh=0)
(loo_5 <- loo(fit_5))
# Compare against the larger fit_4 (both on the log scale).
loo_compare(loo_4, loo_5)
round(median(loo_R2(fit_5)),2)
round(median(bayes_R2(fit_5)),2)
```

