Code for producing figures in StanCon 2018 Helsinki tutorial


Load packages

library(dplyr)
library(tidyr)
library("rstanarm")
options(mc.cores = parallel::detectCores())
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans", base_size=16))
library(brms)
library(loo)

Simple fake data

Simulate fake data

x <- 1:20
n <- length(x)
a <- 0.2
b <- 0.3
sigma <- 1
# set the random seed to get reproducible results
# change the seed to experiment with variation due to random noise
set.seed(2141) 
y <- a + b*x + sigma*rnorm(n)
fake <- data.frame(x, y)
# another random seed for second data realisation
set.seed(12241) 
y2 <- a + b*x + sigma*rnorm(n)
fake2 <- data.frame(x, y=y2)
# data with random x
set.seed(2141) 
x <- rnorm(20, 10, 5)
y <- a + b*x + sigma*rnorm(n)
faker <- data.frame(x, y)

Fit linear model

fit_1 <- stan_glm(y ~ x, data = fake, seed=2141)
fit_2 <- stan_glm(y ~ x, data = fake2, seed=2141)
print(fit_1, digits=2)
## stan_glm
##  family:       gaussian [identity]
##  formula:      y ~ x
##  observations: 20
##  predictors:   2
## ------
##             Median MAD_SD
## (Intercept) 0.60   0.46  
## x           0.26   0.04  
## 
## Auxiliary parameter(s):
##       Median MAD_SD
## sigma 0.98   0.17  
## 
## Sample avg. posterior predictive distribution of y:
##          Median MAD_SD
## mean_PPD 3.34   0.31  
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg

Compute exact leave-one-out cross-validation for i=18

fit_1loo <- stan_glm(y ~ x, data = fake[-18,], seed=2141)
fit_2loo <- stan_glm(y ~ x, data = fake2[-18,], seed=2141)

Extract posterior draws

sims <- as.matrix(fit_1)
sims2 <- as.matrix(fit_2)
simsloo <- as.matrix(fit_1loo)
sims2loo <- as.matrix(fit_2loo)
n_sims <- nrow(sims)

Compute conditional distribution given x=18

cond<-data.frame(y=seq(0,9,length.out=100))
cond$x <- dnorm(cond$y, a + b*18, sigma)*6+18

Compute posterior predictive distribution given x=18

condpred<-data.frame(y=seq(0,9,length.out=100))
condpred$x <- sapply(condpred$y, FUN=function(y) mean(dnorm(y, sims[,1] + sims[,2]*18, sims[,3])*6+18))

Compute posterior predictive distribution given x=18 for second data set

condpred2<-data.frame(y=seq(0,10,length.out=100))
condpred2$x <- sapply(condpred2$y, FUN=function(y) mean(dnorm(y, sims2[,1] + sims2[,2]*18, sims2[,3])*6+18))

Compute LOO posterior predictive distribution given x=18

condpredloo<-data.frame(y=seq(0,9,length.out=100))
condpredloo$x <- sapply(condpredloo$y, FUN=function(y) mean(dnorm(y, simsloo[,1] + simsloo[,2]*18, simsloo[,3])*6+18))

Compute LOO posterior predictive distribution given x=18 for second data set

condpred2loo<-data.frame(y=seq(0,10,length.out=100))
condpred2loo$x <- sapply(condpred2loo$y, FUN=function(y) mean(dnorm(y, sims2loo[,1] + sims2loo[,2]*18, sims2loo[,3])*6+18))

Plot mean of data generating mechanism

p1 <- ggplot(fake, aes(x = x, y = y)) +
    geom_abline(intercept=a, slope=b) +
    xlim(0,21) + ylim(0,9) +
    labs(title = "True mean y = a + bx") 
p1

ggsave('figs/fake1.pdf', p1, width=6, height=4)

Plot data generating mechanism

p2 <- p1 +
    geom_path(data=cond,aes(x=x,y=y)) + geom_vline(xintercept=18, linetype=2) +
    labs(title = "True mean and sigma") 
p2

ggsave('figs/fake2.pdf', p2, width=6, height=4)

Plot one realisation from the data generating mechanism

p3 <- p2 +
  geom_point(color = "white", size = 3) +
  geom_point(color = "black", size = 2) +
  labs(title = "Data") 
p3

ggsave('figs/fake3.pdf', p3, width=6, height=4)

Plot posterior mean

p4 <- p3 +
  geom_abline(
    intercept = mean(sims[, 1]),
    slope = mean(sims[, 2]),
    size = 1,
    color = "red"
  ) +
  labs(title = "Posterior mean") 
p4

ggsave('figs/fake4.pdf', p4, width=6, height=4)

Plot posterior mean given second data set

p4b <- p2 +
  geom_point(data=fake2, color = "white", size = 3) +
  geom_point(data=fake2, color = "black", size = 2) +
  geom_abline(
    intercept = mean(sims2[, 1]),
    slope = mean(sims2[, 2]),
    size = 1,
    color = "red"
  ) +
  labs(title = "Posterior mean, alternative data realisation") 
p4b
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

ggsave('figs/fake4b.pdf', p4b, width=6, height=4)
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

Plot posterior draws

p4s <- p4 +
    geom_abline(
        intercept = sims[seq(1,1001,by=10), 1],
        slope = sims[seq(1,1001,by=10), 2],
        size = 0.1,
        color = "blue",
        alpha = 0.1
    ) + ggtitle("Posterior draws")
p4s