Code for producing figures in StanCon 2018 Helsinki tutorial

Load packages

library(dplyr)
library(tidyr)
library("rstanarm")
options(mc.cores = parallel::detectCores())
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans", base_size=16))
library(brms)
library(loo)

Simple fake data

Simulate fake data

x <- 1:20
n <- length(x)
a <- 0.2
b <- 0.3
sigma <- 1
# set the random seed to get reproducible results
# change the seed to experiment with variation due to random noise
set.seed(2141) 
y <- a + b*x + sigma*rnorm(n)
fake <- data.frame(x, y)
# another random seed for second data realisation
set.seed(12241) 
y2 <- a + b*x + sigma*rnorm(n)
fake2 <- data.frame(x, y=y2)
# data with random x
set.seed(2141) 
x <- rnorm(20, 10, 5)
y <- a + b*x + sigma*rnorm(n)
faker <- data.frame(x, y)

Fit linear model

fit_1 <- stan_glm(y ~ x, data = fake, seed=2141)
fit_2 <- stan_glm(y ~ x, data = fake2, seed=2141)

print(fit_1, digits=2)

## stan_glm
##  family:       gaussian [identity]
##  formula:      y ~ x
##  observations: 20
##  predictors:   2
## ------
##             Median MAD_SD
## (Intercept) 0.60   0.46  
## x           0.26   0.04  
## 
## Auxiliary parameter(s):
##       Median MAD_SD
## sigma 0.98   0.17  
## 
## Sample avg. posterior predictive distribution of y:
##          Median MAD_SD
## mean_PPD 3.34   0.31  
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg

Compute exact leave-one-out cross-validation for i=18

fit_1loo <- stan_glm(y ~ x, data = fake[-18,], seed=2141)
fit_2loo <- stan_glm(y ~ x, data = fake2[-18,], seed=2141)

Extract posterior draws

sims <- as.matrix(fit_1)
sims2 <- as.matrix(fit_2)
simsloo <- as.matrix(fit_1loo)
sims2loo <- as.matrix(fit_2loo)
n_sims <- nrow(sims)

Compute conditional distribution given x=18

cond<-data.frame(y=seq(0,9,length.out=100))
cond$x <- dnorm(cond$y, a + b*18, sigma)*6+18

Compute posterior predictive distribution given x=18

condpred<-data.frame(y=seq(0,9,length.out=100))
condpred$x <- sapply(condpred$y, FUN=function(y) mean(dnorm(y, sims[,1] + sims[,2]*18, sims[,3])*6+18))

Compute posterior predictive distribution given x=18 for second data set

condpred2<-data.frame(y=seq(0,10,length.out=100))
condpred2$x <- sapply(condpred2$y, FUN=function(y) mean(dnorm(y, sims2[,1] + sims2[,2]*18, sims2[,3])*6+18))

Compute LOO posterior predictive distribution given x=18

condpredloo<-data.frame(y=seq(0,9,length.out=100))
condpredloo$x <- sapply(condpredloo$y, FUN=function(y) mean(dnorm(y, simsloo[,1] + simsloo[,2]*18, simsloo[,3])*6+18))

Compute LOO posterior predictive distribution given x=18 for second data set

condpred2loo<-data.frame(y=seq(0,10,length.out=100))
condpred2loo$x <- sapply(condpred2loo$y, FUN=function(y) mean(dnorm(y, sims2loo[,1] + sims2loo[,2]*18, sims2loo[,3])*6+18))

Plot mean of data generating mechanism

p1 <- ggplot(fake, aes(x = x, y = y)) +
    geom_abline(intercept=a, slope=b) +
    xlim(0,21) + ylim(0,9) +
    labs(title = "True mean y = a + bx") 
p1

ggsave('figs/fake1.pdf', p1, width=6, height=4)

Plot data generating mechanism

p2 <- p1 +
    geom_path(data=cond,aes(x=x,y=y)) + geom_vline(xintercept=18, linetype=2) +
    labs(title = "True mean and sigma") 
p2

ggsave('figs/fake2.pdf', p2, width=6, height=4)

Plot one realisation from the data generating mechanism

p3 <- p2 +
  geom_point(color = "white", size = 3) +
  geom_point(color = "black", size = 2) +
  labs(title = "Data") 
p3

ggsave('figs/fake3.pdf', p3, width=6, height=4)

Plot posterior mean

p4 <- p3 +
  geom_abline(
    intercept = mean(sims[, 1]),
    slope = mean(sims[, 2]),
    size = 1,
    color = "red"
  ) +
  labs(title = "Posterior mean") 
p4

ggsave('figs/fake4.pdf', p4, width=6, height=4)

Plot posterior mean given second data set

p4b <- p2 +
  geom_point(data=fake2, color = "white", size = 3) +
  geom_point(data=fake2, color = "black", size = 2) +
  geom_abline(
    intercept = mean(sims2[, 1]),
    slope = mean(sims2[, 2]),
    size = 1,
    color = "red"
  ) +
  labs(title = "Posterior mean, alternative data realisation") 
p4b

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

ggsave('figs/fake4b.pdf', p4b, width=6, height=4)

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

Plot posterior draws

p4s <- p4 +
    geom_abline(
        intercept = sims[seq(1,1001,by=10), 1],
        slope = sims[seq(1,1001,by=10), 2],
        size = 0.1,
        color = "blue",
        alpha = 0.1
    ) + ggtitle("Posterior draws")
p4s

ggsave('figs/fake4s.pdf', p4s, width=6, height=4)

Plot posterior predictive distribution

p5 <- p4 +
    geom_path(data=condpred,aes(x=x,y=y), color="red") +
    geom_vline(xintercept=18, linetype=2, color="red") +
    ggtitle("Posterior predictive distribution")
p5

ggsave('figs/fake5.pdf', p5, width=6, height=4)

Plot posterior predictive distribution given second data set

p5b <- p4b +
    geom_path(data=condpred2,aes(x=x,y=y), color="red") +
    geom_vline(xintercept=18, linetype=2, color="red") +
    ggtitle("Posterior predictive distribution")
p5b

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 10 rows containing missing values (geom_path).

ggsave('figs/fake5b.pdf', p5b, width=6, height=4)

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 10 rows containing missing values (geom_path).

p5

Plot new data

p5 + geom_point(data=fake2, color = "black", size = 2, shape=1) +
    ggtitle("New data")

## Warning: Removed 1 rows containing missing values (geom_point).

ggsave('figs/fake5n.pdf', width=6, height=4)

## Warning: Removed 1 rows containing missing values (geom_point).

Highlight 18th data point

p6 <- p5 +
    geom_point(data=fake[18,], color = "forestgreen", size = 5, shape=1)
p6

ggsave('figs/fake6.pdf', p6, width=6, height=4)

Plot leave-one-out predictive mean

p7 <- p6 +
  geom_abline(
    intercept = mean(simsloo[, 1]),
    slope = mean(simsloo[, 2]),
    size = 1,
    color = "forestgreen"
  ) +
    ggtitle("Leave-one-out mean")
p7

ggsave('figs/fake7.pdf', p7, width=6, height=4)

Illustrate leave-one-out residual

p7 + geom_segment(x=18, y=mean(simsloo[,1] + simsloo[,2]*18),
                  xend=18, yend=fake[18,"y"],
                  color="blue", size=1,
                  arrow = arrow(length = unit(4, "mm"))) +
    ggtitle("Leave-one-out residual") +
    geom_text(x=17,y=6,label=round(fake[18,"y"]-mean(simsloo[,1] + simsloo[,2]*18),1),color="blue")

ggsave('figs/fake7r.pdf', width=6, height=4)

Plot leave-one-out predictive distribution

p8 <- p7 +
    geom_path(data=condpredloo,aes(x=x,y=y), color="forestgreen") +
    geom_vline(xintercept=18, linetype=2, color="forestgreen") +
    ggtitle("Leave-one-out predictive distribution")
p8

ggsave('figs/fake8.pdf', p8, width=6, height=4)

Illustrate posterior predictive density given 18th data point

pd18<-mean(dnorm(fake[18,"y"], sims[,1] + sims[,2]*18, sims[,3]))
p8 + geom_segment(x=18, y=fake[18,"y"], yend=fake[18,"y"],
                  xend=18+pd18*6,
                  color="blue", size=1) +
    ggtitle("Posterior predictive density") +
    geom_text(x=17,y=fake[18,"y"],label=round(pd18,2),color="blue")

ggsave('figs/fake8pd.pdf', width=6, height=4)

Illustrate leave-one-out predictive density given 18th data point

pd18<-mean(dnorm(fake[18,"y"], simsloo[,1] + simsloo[,2]*18, simsloo[,3]))
p8 + geom_segment(x=18, y=fake[18,"y"], yend=fake[18,"y"],
                  xend=mean(dnorm(fake[18,"y"], simsloo[,1] + simsloo[,2]*18, simsloo[,3])*6+18),
                  color="blue", size=1) +
    ggtitle("Leave-one-out predictive density") +
    geom_text(x=17,y=fake[18,"y"],label=round(pd18,2),color="blue")

ggsave('figs/fake8loopd.pdf', width=6, height=4)

Compute PSIS-LOO

loo_1 <- loo(fit_1, save_psis=TRUE)
round(exp(loo_1$pointwise[,1]),2)

##  [1] 0.17 0.34 0.29 0.37 0.37 0.03 0.22 0.38 0.31 0.35 0.13 0.21 0.17 0.35
## [15] 0.38 0.31 0.32 0.03 0.35 0.35

Leave-one-out predictive densities

p8 + geom_text(data=fake[seq(1,20,by=2),], aes(x=x, y=rep(8.5,10)), label=signif(exp(loo_1$pointwise[seq(1,20,by=2),1]),1),
               color="blue") +
    geom_segment(data=fake[seq(1,20,by=2),], aes(x=x, y=y, xend=x, yend=rep(8.5,10)),
                 linetype=3, size=0.2) +
    geom_text(data=fake[seq(2,20,by=2),], aes(x=x, y=rep(9,10)), label=signif(exp(loo_1$pointwise[seq(2,20,by=2),1]),1),
               color="blue") +
    geom_segment(data=fake[seq(2,20,by=2),], aes(x=x, y=y, xend=x, yend=rep(9,10)),
                 linetype=3, size=0.2) +
    ggtitle("Leave-one-out predictive densities")

ggsave('figs/fake8loopds.pdf', width=6, height=4)

Leave-one-out log predictive densities

p8 + geom_text(data=fake[seq(1,20,by=2),], aes(x=x, y=rep(8.5,10)), label=signif((loo_1$pointwise[seq(1,20,by=2),1]),2),
               color="blue") +
    geom_segment(data=fake[seq(1,20,by=2),], aes(x=x, y=y, xend=x, yend=rep(8.5,10)),
                 linetype=3, size=0.2) +
    geom_text(data=fake[seq(2,20,by=2),], aes(x=x, y=rep(9,10)), label=signif((loo_1$pointwise[seq(2,20,by=2),1]),2),
               color="blue") +
    geom_segment(data=fake[seq(2,20,by=2),], aes(x=x, y=y, xend=x, yend=rep(9,10)),
                 linetype=3, size=0.2) +
    ggtitle("Leave-one-out log predictive densities")

ggsave('figs/fake8loolpds.pdf', width=6, height=4)

Leave-one-out log predictive densities

p8elpds <- p8 + geom_text(data=fake[seq(1,20,by=2),], aes(x=x, y=rep(8.6,10)), label=signif((loo_1$pointwise[seq(1,20,by=2),1]),2),
               color="blue") +
    geom_segment(data=fake[seq(1,20,by=2),], aes(x=x, y=y, xend=x, yend=rep(8.6,10)),
                 linetype=3, size=0.2) +
    geom_text(data=fake[seq(2,20,by=2),], aes(x=x, y=rep(8.9,10)), label=signif((loo_1$pointwise[seq(2,20,by=2),1]),2),
               color="blue") +
    geom_segment(data=fake[seq(2,20,by=2),], aes(x=x, y=y, xend=x, yend=rep(8.9,10)),
                 linetype=3, size=0.2) +
    ggtitle("Leave-one-out log predictive densities")
p8elpds

Highlight 18th data point in second data set

p6b <- p5b +
    geom_point(data=fake2[18,], color = "forestgreen", size = 5, shape=1)
p6b

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 10 rows containing missing values (geom_path).

Plot leave-one-out predictive mean for second data set

p7b <- p6b +
  geom_abline(
    intercept = mean(sims2loo[, 1]),
    slope = mean(sims2loo[, 2]),
    size = 1,
    color = "forestgreen"
  ) +
    ggtitle("Leave-one-out mean")
p7b

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 10 rows containing missing values (geom_path).

Plot leave-one-out predictive distribution for second data set

p8b <- p7b +
    geom_path(data=condpred2loo,aes(x=x,y=y), color="forestgreen") + geom_vline(xintercept=18, linetype=2, color="forestgreen") +
    ggtitle("Leave-one-out predictive distribution")
p8b

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 10 rows containing missing values (geom_path).

## Warning: Removed 10 rows containing missing values (geom_path).

Plot data

pd <- ggplot(fake, aes(x = x, y = y)) +
    xlim(0,21) + ylim(0,9) +
  geom_point(color = "white", size = 3) +
    geom_point(color = "black", size = 2) +
    ggtitle("Data")
pd

ggsave('figs/fakedata.pdf', pd, width=6, height=4)

Plot posterior draws

pdl <- pd + 
    geom_abline(
        intercept = sims[seq(1,4000,by=10), 1],
        slope = sims[seq(1,4000,by=10), 2],
        size = 0.1,
        color = "blue",
        alpha = 0.1
    )  + ggtitle("Posterior draws")
pdl

ggsave('figs/fakedraws.pdf', pdl, width=6, height=4)

Add posterior predictive distribution

pdl2 <- pdl + geom_path(data=condpred,aes(x=x,y=y), color="red") +
    geom_vline(xintercept=18, linetype=2, color="red") +
    ggtitle("Posterior predictive distribution")
pdl2

ggsave('figs/fakepostpred.pdf', pdl2, width=6, height=4)

Highlight 18th data point

pdl3 <- pdl2 + geom_point(data=fake[18,], color = "forestgreen", size = 5, shape=1)
pdl3

ggsave('figs/fakepostpred18.pdf', pdl3, width=6, height=4)

Plot PSIS-LOO weighted draws

q<-exp(loo_1$psis_object$log_weights[seq(1,4000,by=10),18])
q<-q/max(q)
p4sl <- pd +
    geom_abline(
        intercept = sims[seq(1,4000,by=10), 1],
        slope = sims[seq(1,4000,by=10), 2],
        size = 0.1,
        color = "blue",
        alpha = q
    ) +
    geom_point(data=fake[18,], color = "forestgreen", size = 5, shape=1) +
    ggtitle("PSIS-LOO weighted draws")
p4sl

ggsave('figs/fakepsisdraws.pdf', p4sl, width=6, height=4)

Plot LOO predictive distribution

p4sl2 <- p4sl + geom_path(data=condpred,aes(x=x,y=y), color="red") +
    geom_vline(xintercept=18, linetype=2, color="red") +
    geom_path(data=condpredloo,aes(x=x,y=y), color="forestgreen") +
    geom_vline(xintercept=18, linetype=2, color="forestgreen") +
    ggtitle("PSIS-LOO weighted predictive distribution")
p4sl2

ggsave('figs/fakepsispostpred.pdf', p4sl2, width=6, height=4)

Plot 400 PSIS-LOO weights

q<-exp(loo_1$psis_object$log_weights[seq(1,4000,by=10),18]);
q<-q/sum(q)
pw <- mcmc_hist(data.frame(weight=q),  binwidth=1/4000) +
    labs(title="400 importance weights for leave-18th-out", x="w_i") +
    geom_vline(xintercept=1/400, linetype=2, color="red") +
    geom_text(x=1/400, y=60, hjust="left", label=" Equal weights", fontface="plain", size=5)
pw

ggsave('figs/fakepsisweights.pdf', pw, width=6, height=4)

Plot 4000 PSIS-LOO weights

q<-exp(loo_1$psis_object$log_weights[seq(1,4000,by=1),18]);
q<-q/sum(q)
pw <- mcmc_hist(data.frame(weight=q),  binwidth=1/40000) +
    labs(title="4000 importance weights for leave-18th-out", x="w_i") +
    geom_vline(xintercept=1/4000, linetype=2, color="red") +
    geom_text(x=1/4000, y=550, hjust="left", label=" Equal weights", fontface="plain", size=5)
pw

ggsave('figs/fakepsisweights4000.pdf', pw, width=6, height=4)

Plot PSIS-LOO Pareto-k diagnostics

pkdf<-data.frame(pk=loo_1$diagnostics$pareto_k,
                 neff=loo_1$diagnostics$n_eff,
                 n=1:20)
ggplot(pkdf, aes(x=n,y=pk)) + geom_point(shape=3, color="blue" ,size=3) +
    labs(x="Observation left out", y="Pareto k") +
    geom_hline(yintercept = 0, linetype=3, color="red", size=0.2) +
    geom_hline(yintercept = 0.5, linetype=4, color="red", size=0.2) +
    geom_hline(yintercept = 0.7, linetype=2, color="red", size=0.2) +
    scale_y_continuous(breaks=c(0, 0.25, 0.5, 0.7), limits=c(-0.1,0.7)) +
    ggtitle("PSIS-LOO diagnostics")

ggsave('figs/fakepks.pdf', width=6, height=4)

Plot PSIS-LOO n_effs’

ggplot(pkdf, aes(x=n,y=neff)) + geom_point(shape=3, color="blue" ,size=3) +
    labs(x="Observation left out", y="n_eff") +
    geom_hline(yintercept = 0, linetype=1, size=0.2) +
    geom_hline(yintercept = 4000, linetype=2, size=0.2) +
    geom_hline(yintercept = 400, linetype=3, color="red", size=0.2) +
    ggtitle("PSIS-LOO diagnostics")

ggsave('figs/fakeneffs.pdf', width=6, height=4)

Illustration of fixed/design x

pd <- ggplot(fake, aes(x = x, y = y)) +
    xlim(0,21) + ylim(0,9) +
    geom_point(color = "white", size = 3) +
    geom_point(color = "black", size = 2) +
    geom_vline(xintercept=1:20, linetype=3) +
    labs(title = "Fixed / designed x")
pd

ggsave('figs/fakedfixed.pdf', pd, width=6, height=4)

Illustration of x with distribution

xd<-data.frame(x=seq(0, 21, length.out=100))
xd$y=dnorm(xd$x,10,5)*20
pd <- ggplot(faker, aes(x = x, y = y)) +
    xlim(0,21) + ylim(0,9) +
    geom_point(color = "white", size = 3) +
    geom_point(color = "black", size = 2) +
    geom_vline(xintercept=x, linetype=3) +
    labs(title = "Distribution for x") +
    geom_path(data=xd, aes(x=x,y=y))
pd

ggsave('figs/fakedrandom.pdf', pd, width=6, height=4)

Lake Huron

N <- length(LakeHuron)

Plot data with no year

lake<-data.frame(x=1:N,y=LakeHuron)
pl <- ggplot(lake[1:50,], aes(x = x, y = y)) +
    geom_point(color = "black", size = 3) +
    ggtitle(" ")
pl

ggsave('figs/lake1data.pdf', pl, width=6, height=4)

Fit gp-spline model

lake$ycen<-(lake$y-mean(lake$y))/sd(lake$y)
lake$xcen<-(lake$x-mean(lake$x))/sd(lake$x)
fitl <- stan_gamm4(ycen ~ xcen + s(xcen, bs="gp"), data=lake[1:50,])

## Warning: There were 13 divergent transitions after warmup. Increasing adapt_delta above 0.95 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup

## Warning: Examine the pairs() plot to diagnose sampling problems

predl <- as.data.frame(posterior_linpred(fitl, draws=100))
predl$iter <- as.numeric(row.names(predl))
predl<-mutate(gather(as.data.frame(predl), "x", "y", -iter),
              x=as.numeric(x),
              y=y*sd(lake$y)+mean(lake$y))

Plot nonlinear model fit

pl <- ggplot(lake[1:50,], aes(x=x, y=y)) +
    geom_point(color = "black", size = 3) +
    geom_line(data=predl, aes(x=x, y=y, group=iter), size=0.1, alpha=0.3,
              color="blue") +
    ggtitle("Nonlinear model fit")
pl

ggsave('figs/lake1gp.pdf', pl, width=6, height=4)

Plot gp-spline model + test data

pl + geom_point(data=lake[51:80,], color = "black", size = 3, shape=1) +
    ggtitle("Nonlinear model fit + new data")

ggsave('figs/lake1gpptest.pdf', width=6, height=4)

Plot with years

lake<-data.frame(x=1875:1972,y=LakeHuron)
pl <- ggplot(lake, aes(x = x, y = y)) +
    geom_point(color = "black", size = 3) +
    labs(y="Level (feet)", x="Year")    
pl

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

ggsave('figs/lake2data.pdf', pl, width=6, height=4)

## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

Illustrate LOO for time serie

pls <- list()
for (i in 1:4) {
    pls[[i]] <- ggplot(lake[-(50+i),], aes(x = x, y = y)) +
        geom_point(color = "black", size = 1) +
        geom_point(data=lake[50+i,], size = 3, shape=1) +
        xlim(1875,1943) +
        labs(y=NULL, x=NULL)
    if (i>=3)
        pls[[i]] <- pls[[i]] + xlab("Year")
    if ((i %% 2) ==1)
        pls[[i]] <- pls[[i]] + ylab("Level (feet)")
}
bp<-bayesplot_grid(plots=pls, save_gg_objects=TRUE)

## Warning: Removed 29 rows containing missing values (geom_point).

## Warning: Removed 29 rows containing missing values (geom_point).

## Warning: Removed 29 rows containing missing values (geom_point).

## Warning: Removed 29 rows containing missing values (geom_point).

bp

ggsave('figs/lake3loo.pdf', bp, width=6, height=4)

Illustrate balanced kfold approximation of LOO

pl <- ggplot(lake[1:59,][-seq(1,59,by=10),], aes(x = x, y = y)) +
    geom_point(color = "black", size = 2) +
    geom_point(data=lake[seq(1,59,by=10),], color = "black", size = 5, shape=1) +
    labs(y="Level (feet)", x="Year", title="Balance k-fold approximation of LOO")    
pl2 <- ggplot(lake[1:59,][-seq(2,59,by=10),], aes(x = x, y = y)) +
    geom_point(color = "black", size = 2) +
    geom_point(data=lake[seq(2,59,by=10),], color = "black", size = 5, shape=1) +
    labs(y="Level (feet)", x="Year", title="Balance k-fold approximation of LOO")    
bp<-bayesplot_grid(pl, pl2)
pl

pl2

ggsave('figs/lake3kfoldbal.pdf', bp, width=6, height=4)
ggsave('figs/lake3kfoldbal1.pdf', pl, width=6, height=4)
ggsave('figs/lake3kfoldbal2.pdf', pl2, width=6, height=4)

Ilustrate random kfold approximation of LOO

set.seed(12345) 
ii <- sample(1:59,6)
pl <- ggplot(lake[1:59,][-ii,], aes(x = x, y = y)) +
    geom_point(color = "black", size = 2) +
    geom_point(data=lake[ii,], color = "black", size = 5, shape=1) +
    labs(y="Level (feet)", x="Year", title="Random k-fold approximation of LOO")    
pl

ggsave('figs/lake3kfoldrand.pdf', width=6, height=4)

Illustrate 1-step-ahead cross-validation

pls <- list()
for (i in 1:4) {
    pls[[i]] <- ggplot(lake[1:(49+i),], aes(x = x, y = y)) +
        geom_point(color = "black", size = 1) +
        geom_point(data=lake[50+i,], size = 3, shape=1) +
        xlim(1875,1943) +
        labs(y=NULL, x=NULL)
    if (i>=3)
        pls[[i]] <- pls[[i]] + xlab("Year")
    if ((i %% 2) ==1)
        pls[[i]] <- pls[[i]] + ylab("Level (feet)")
}
bp<-bayesplot_grid(plots=pls)
ggsave('figs/lake3stepahead.pdf', bp, width=6, height=4)

Illustrate 10-step-ahead cross-validation

pls <- list()
for (i in 1:4) {
    pls[[i]] <- ggplot(lake[1:(49+i),], aes(x = x, y = y)) +
        geom_point(color = "black", size = 1) +
        geom_point(data=lake[i+(50:59),], size = 3, shape=1) +
        xlim(1875,1943) +
        labs(y=NULL, x=NULL)
    if (i>=3)
        pls[[i]] <- pls[[i]] + xlab("Year")
    if ((i %% 2) ==1)
        pls[[i]] <- pls[[i]] + ylab("Level (feet)")
}
bp<-bayesplot_grid(plots=pls)
ggsave('figs/lake3tenstepahead.pdf', bp, width=6, height=4)

Illustrate 1-step-ahead with remove a block cross-validation

pls <- list()
for (i in 1:4) {
    pls[[i]] <- ggplot(lake[-(i+(50:59)),], aes(x = x, y = y)) +
        geom_point(color = "black", size = 1) +
        geom_point(data=lake[i+50,], size = 3, shape=1) +
        xlim(1875,1943) +
        labs(y=NULL, x=NULL)
    if (i>=3)
        pls[[i]] <- pls[[i]] + xlab("Year")
    if ((i %% 2) ==1)
        pls[[i]] <- pls[[i]] + ylab("Level (feet)")
}
bp<-bayesplot_grid(plots=pls)

## Warning: Removed 29 rows containing missing values (geom_point).

## Warning: Removed 29 rows containing missing values (geom_point).

## Warning: Removed 29 rows containing missing values (geom_point).

## Warning: Removed 29 rows containing missing values (geom_point).

ggsave('figs/lake3stepaheadblock.pdf', bp, width=6, height=4)

Illustrate Marginal likelihood / Bayes factor

pls <- list()
for (i in 1:4) {
    pls[[i]] <- ggplot(lake[0:(i-1),], aes(x = x, y = y)) +
        geom_point(color = "black", size = 1) +
        geom_point(data=lake[i,], size = 3, shape=1) +
        xlim(1875,1943) + ylim(579.5, 582.5) +
        labs(y=NULL, x=NULL)
    if (i>=3)
        pls[[i]] <- pls[[i]] + xlab("Year")
    if ((i %% 2) ==1)
        pls[[i]] <- pls[[i]] + ylab("Level (feet)")
}
bp<-bayesplot_grid(plots=pls)
ggsave('figs/lake3bf.pdf', bp, width=6, height=4)

PSIS for m-step

Plot whole data

N <- length(LakeHuron)
df <- data.frame(y = as.numeric(LakeHuron), year=1875:1972, time=1:N)
ggplot(df, aes(x=year, y=y)) + geom_point() +
    labs(y="Level (feet)", x="Year", title="Data")

ggsave('figs/lake4data.pdf', width=6, height=4)

Fit AR-2 model

control <- list(adapt_delta = 0.95)
SEED<-1234

fit <- brm(y ~ 1, data = df, autocor = cor_ar(~time, p = 2), 
           control = control, seed = SEED)

## Compiling the C++ model

## Start sampling

pred <- cbind(df, predict(fit))

AR-2 prediction with 95% interval

ggplot(pred, aes(year, Estimate)) +
  geom_smooth(
    aes(ymin = Q2.5, ymax = Q97.5),
    stat = "identity"
  ) +
  geom_point(aes(year, y), inherit.aes = FALSE) +
  labs(x="Year", title = "AR-2 prediction with 95% interval")

ggsave('figs/lake4pred.pdf', width=6, height=4)

PSIS-m-step-ahead

L<-20
approx_elpds_1sap_no_refit <- rep(NA, nrow(df))
loglik <- log_lik(fit)
logr <- matrix(nrow = nsamples(fit), ncol = nrow(df))
for (i in N:(L + 1)) {
  logr[, i] <- - rowSums(loglik[, i:N, drop = FALSE])
  psis_part <- suppressWarnings(psis(logr[, i:N]))
  w_i <- exp(psis_part$log_weights[, 1])
  approx_elpds_1sap_no_refit[i] <- 
    log(sum(exp(loglik[, i]) * w_i) / sum(w_i))
}

Plot PSIS-m-step-ahead

is_na <- apply(logr, 2, anyNA)
ps <- psis(logr[, !is_na])

## Warning: Relative effective sample sizes ('r_eff' argument) not specified.
## PSIS n_eff will not adjusted based on MCMC n_eff.

## Warning: Some Pareto k diagnostic values are too high. See help('pareto-k-diagnostic') for details.

pkdf <- data.frame(pk <- ps$diagnostics$pareto_k, n= ((L+1):N)+1875)
ggplot(pkdf, aes(x=n,y=pk)) + geom_point(shape=3, color="blue") +
    labs(x="Year", y="Pareto k", title="PSIS-1-step-ahead") +
    geom_hline(yintercept = 0, linetype=3, color="red", size=0.2) +
    geom_hline(yintercept = 0.5, linetype=4, color="red", size=0.2) +
    geom_hline(yintercept = 0.7, linetype=2, color="red", size=0.2) +
    geom_hline(yintercept = 1, color="red", size=0.2) +
    ylim(-0.3,1.5)

ggsave('figs/lake4psismstep.pdf', width=6, height=4)

PSIS-m-step-ahead with refits

k_thres<-0.65
log_mean_exp <- function(x) {
  # more stable than log(mean(exp(x)))
  max_x <- max(x)
  max_x + log(sum(exp(x - max_x))) - log(length(x))
}
loglik <- logr <- matrix(nrow = nsamples(fit), ncol = nrow(df))
ks <- approx_elpds_1sap <- rep(NA, nrow(df))
fit_part <- fit
i_refit <- N
refits <- NULL
for (i in N:(L + 1)) {
  loglik[, i] <- log_lik(fit_part)[, i]
  logr[, i] <- - rowSums(loglik[, i:i_refit, drop = FALSE])
  psis_part <- suppressWarnings(psis(logr[, i]))
  ks[i]<-psis_part$diagnostics$pareto_k
  if (any(psis_part$diagnostics$pareto_k > k_thres)) {
    # refit the model based on the first i-1 observations
    i_refit <- i
    refits <- c(refits, i)
    fit_part <- update(
      fit_part, newdata = df[1:(i-1), ], 
      recompile = FALSE, chains = 1, seed = SEED
    )
    loglik[, i] <- log_lik(fit_part, newdata = df[1:i, ])[, i]
    logr[, i] <- - rowSums(loglik[, i:i_refit, drop = FALSE])
    approx_elpds_1sap[i] <- log_mean_exp(loglik[, i])
  } else {
    w_i <- exp(psis_part$log_weights[, 1])
    approx_elpds_1sap[i] <- log(sum(exp(loglik[, i]) * w_i) / sum(w_i))
  }
}

## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling
## Start sampling

Plot PSIS-m-step-ahead with refits

is_na <- apply(logr, 2, anyNA)
pkdf<-data.frame(pk=ks[(L+1):N],n= ((L+1):N)+1875)
ggplot(pkdf, aes(x=n,y=pk)) + geom_point(shape=3, color="blue") +
    labs(x="Year", y="Pareto k", title="PSIS-1-step-ahead with refits") +
    geom_hline(yintercept = 0, linetype=3, color="red", size=0.2) +
    geom_hline(yintercept = 0.5, linetype=4, color="red", size=0.2) +
    geom_hline(yintercept = 0.7, linetype=2, color="red", size=0.2) +
    geom_hline(yintercept = 1, color="red", size=0.2) +
    geom_point(data=pkdf[refits-L,], shape=1, size=5, color="blue") +
    ylim(-0.3,1.5)

ggsave('figs/lake4psisrefits.pdf', width=6, height=4)

Rats

Load data

sourceToList = function(file){
  source(file, local = TRUE)
  d = mget(ls())
  d$file = NULL
  d
}
#
rats = sourceToList("rats.data.R")
rats = with(rats, list(
  N = N,
  Npts = length(y),
  rat = rep(1:nrow(y), ncol(y)),
  x = rep(x, each = nrow(y)),
  y = as.numeric(y),
  xbar = xbar
))
ratsdf <- with(rats, data.frame(x=x, y=y, rat=rat))

Plot rats data

pr <- ggplot(data=ratsdf, aes(x=x, y=y)) +
    geom_line(aes(group=rat), color="black", size=0.1) +
    geom_point(color="black", size=2) +
    labs(x="Age", y="Weight", title="Rats data")
pr

ggsave('figs/rats1data.pdf', pr, width=6, height=4)

Leave-one-out?

pr1 <- pr +
    geom_point(data=ratsdf[69,], color="red", size=5, shape=1) +
    ggtitle('Leave-one-out?')
pr1

ggsave('figs/rats1loo.pdf', pr1, width=6, height=4)

1-step-ahead?

pr1 <- pr +
    geom_point(data=ratsdf[121:150,], color="red", size=5, shape=1) +
    ggtitle('1-step-ahead?')
pr1

ggsave('figs/rats1step.pdf', pr1, width=6, height=4)

Leave-one-time-point-out?

pr1 <- pr +
    geom_point(data=ratsdf[61:90,], color="red", size=5, shape=1) +
    ggtitle('Leave-one-time-point-out?')
pr1

ggsave('figs/rats1onetime.pdf', pr1, width=6, height=4)

Leave-one-rat-out?

pr1 <- pr +
    geom_point(data=ratsdf[seq(9,129,by=30),], color="red", size=5, shape=1) +
    ggtitle('Leave-one-rat-out?')
pr1

ggsave('figs/rats1onerat.pdf', pr1, width=6, height=4)

## pr1 <- pr +
##     geom_point(data=ratsdf[seq(9,129,by=30),], color="red", size=5, shape=1) +
##     ggtitle('Leave-one-rat-out')
## pr1
## ggsave('figs/rats1oneratb.pdf', pr1, width=6, height=4)

Predict given initial weight?

pr1 <- pr +
    geom_point(data=ratsdf[seq(39,129,by=30),], color="red", size=5, shape=1) +
    ggtitle('Predict given initial weight?')
pr1

ggsave('figs/rats1init.pdf', pr1, width=6, height=4)

Random kfold approximation of LOO?

pr1 <- pr +
    geom_point(data=ratsdf[sample(1:150,15),], color="red", size=5, shape=1) +
    ggtitle('Random kfold approximation of LOO')
pr1

ggsave('figs/rats1kfoldrand.pdf', pr1, width=6, height=4)

Primate milk

A popular hypothesis has it that primates with larger brains produce more energetic milk, so that brains can grow quickly.

Load data

data(milk)
d <- milk[complete.cases(milk),]
d$neocortex <- d$neocortex.perc /100
str(d)

## 'data.frame':    17 obs. of  9 variables:
##  $ clade         : Factor w/ 4 levels "Ape","New World Monkey",..: 4 2 2 2 2 2 2 2 3 3 ...
##  $ species       : Factor w/ 29 levels "A palliata","Alouatta seniculus",..: 11 2 1 6 27 5 3 4 21 19 ...
##  $ kcal.per.g    : num  0.49 0.47 0.56 0.89 0.92 0.8 0.46 0.71 0.68 0.97 ...
##  $ perc.fat      : num  16.6 21.2 29.7 53.4 50.6 ...
##  $ perc.protein  : num  15.4 23.6 23.5 15.8 22.3 ...
##  $ perc.lactose  : num  68 55.2 46.9 30.8 27.1 ...
##  $ mass          : num  1.95 5.25 5.37 2.51 0.68 0.12 0.47 0.32 1.55 3.24 ...
##  $ neocortex.perc: num  55.2 64.5 64.5 67.6 68.8 ...
##  $ neocortex     : num  0.552 0.645 0.645 0.676 0.688 ...

Fit various models

fit1 <- stan_glm(kcal.per.g ~ 1, data = d, seed = 2030)
fit2 <- update(fit1, formula = kcal.per.g ~ neocortex)
fit3 <- update(fit1, formula = kcal.per.g ~ log(mass))
fit4 <- update(fit1, formula = kcal.per.g ~ neocortex + log(mass))

Compute LOO

loo1 <- loo(fit1)
loo2 <- loo(fit2)
loo3 <- loo(fit3)
loo4 <- loo(fit4)
lpd_point <- cbind(
  loo1$pointwise[,"elpd_loo"], 
  loo2$pointwise[,"elpd_loo"],
  loo3$pointwise[,"elpd_loo"], 
  loo4$pointwise[,"elpd_loo"]
)

Plot elpd_loo’s

lols <- data.frame(loo1=loo1$pointwise[,1], loo2=loo2$pointwise[,1],
                   loo3=loo3$pointwise[,1], loo4=loo4$pointwise[,1],
                   n=1:17)

pl <- ggplot(lols, aes(x=n)) +
    geom_point(aes(y=loo2), color = "blue", size = 3) +
    geom_point(aes(y=loo4), color = "red", size = 3) +
    labs(x="Observation", y="elpd_loo", title="Pointwise comparison LOO models: Model 1 = blue, Model 2 = red")
pl

ggsave('figs/milkelpdloo.pdf', width=6, height=4)

Plot elpd_loo SE’s

pl + geom_segment(aes(xend=n, y=loo2, yend=loo4), linetype=3)

ggsave('figs/milkelpdloo2.pdf', width=6, height=4)

Plot elpd_diff’s

pl <- ggplot(lols, aes(x=n)) +
    geom_point(aes(y=loo4-loo2), color = "violet", size = 3) +
    geom_hline(yintercept = 0, linetype = 2) +
    labs(x="Observation", y="Pointwise elpd_diff", title="Pointwise comparison LOO models") 
pl

ggsave('figs/milkelpddiff.pdf', width=6, height=4)

Basics of Bayesian cross-validation for model assessment and selection

Aki Vehtari

2018-08-29

Simple fake data

Lake Huron

PSIS for m-step

Rats

Primate milk