Load packages
library(rstan)
library(ggplot2)
library(bayesplot)
theme_set(bayesplot::theme_default(base_family = "sans"))
library(loo)
library(rprojroot)
root <- has_file(".BDA_R_demos_root")$make_fix_file()
source(root("demos_rstan/ppc", "count-data.R"))
print(N)
[1] 500
print(y)
[1] 0 3 5 0 4 7 4 2 3 6 7 0 0 3 7 5 5 0 4 0 4 4 6 3 7 5 3 0 0 2 0 1 0 1 5 4 4
[38] 2 3 6 4 5 0 7 7 4 4 4 0 6 1 5 6 5 6 7 3 6 2 3 0 2 0 6 6 0 3 4 4 5 5 0 5 7
[75] 5 5 6 4 2 3 4 6 4 6 6 4 0 6 5 5 7 0 1 6 7 0 5 0 0 5 6 5 1 0 7 1 2 6 5 4 0
[112] 4 0 4 4 6 3 0 0 3 3 4 2 5 3 4 3 2 5 2 4 4 0 2 7 5 7 5 5 7 7 0 4 6 0 4 6 7
[149] 4 0 4 1 5 0 3 5 7 6 0 5 5 6 7 6 7 3 4 3 7 7 2 5 4 5 5 0 6 2 4 5 4 0 0 5 5
[186] 7 7 0 3 0 3 3 6 1 4 2 0 4 7 5 5 0 3 7 0 6 6 4 1 6 7 6 0 3 6 4 7 0 5 5 4 0
[223] 0 2 4 6 0 5 0 2 7 2 7 5 4 6 2 4 0 4 0 0 3 5 4 3 5 5 7 7 0 6 4 5 1 5 3 5 5
[260] 5 0 2 7 6 2 3 2 5 4 7 6 7 3 3 4 4 6 4 6 7 1 5 6 3 3 6 3 4 0 7 0 3 6 5 0 0
[297] 0 5 4 4 0 4 7 5 5 3 3 0 0 5 4 0 7 6 0 6 2 0 6 1 0 4 0 4 3 0 4 5 5 7 6 6 5
[334] 4 7 0 6 4 7 7 5 0 1 4 7 6 4 5 4 7 2 5 2 6 3 2 7 4 3 4 6 6 6 6 7 1 0 0 7 7
[371] 4 2 4 5 5 7 4 1 7 6 5 6 5 4 0 0 7 0 0 5 6 6 3 6 0 0 0 4 4 3 0 7 5 4 2 7 0
[408] 4 0 0 2 4 5 0 4 2 5 2 0 6 6 3 6 0 2 5 0 0 0 6 0 0 6 5 4 6 4 5 5 4 0 3 4 3
[445] 3 5 3 4 5 7 0 0 1 4 6 3 5 7 6 6 5 0 5 4 0 0 2 6 0 6 0 4 5 6 3 4 2 3 4 0 5
[482] 0 0 0 0 3 4 7 6 7 7 3 4 4 7 4 5 2 5 6
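A quick tabulation (an addition to the demo) makes the shape of the data easy to see before plotting; the spike at zero is the feature to watch.
# Counts of each observed value 0-7
table(y)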
qplot(y)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
x <- rpois(N, mean(y))
qplot(x)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plotdata <- data.frame(
  value = c(y, x),
  variable = rep(c("Our data", "Poisson data"), each = N)
)
ggplot(plotdata, aes(x = value, color = variable)) +
  geom_freqpoly(binwidth = 0.5) +
  scale_x_continuous(name = "", breaks = 0:max(x, y)) +
  scale_color_manual(name = "", values = c("gray30", "purple"))
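The mismatch at zero can already be quantified here. A Poisson distribution with rate equal to the sample mean puts probability exp(-mean(y)), about 0.026, on zero, while roughly 20% of the observed values are zeros. A quick check in base R (my addition, not in the original demo):
# Observed share of zeros vs. the share a mean-matched Poisson implies
mean(y == 0)       # about 0.20
dpois(0, mean(y))  # about 0.026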
Even though we already suspect it won’t be a good model for these data, it’s still a good idea to start by fitting the simplest Poisson model. From there we can identify the ways in which the model is inadequate.
code_poisson <- root("demos_rstan/ppc", "poisson-simple.stan")
writeLines(readLines(code_poisson))
data {
  int<lower=1> N;
  array[N] int<lower=0> y;
}
parameters {
  real<lower=0> lambda;
}
model {
  lambda ~ exponential(0.2);
  y ~ poisson(lambda);
}
generated quantities {
  array[N] real log_lik;
  array[N] int y_rep;
  for (n in 1 : N) {
    y_rep[n] = poisson_rng(lambda);
    log_lik[n] = poisson_lpmf(y[n] | lambda);
  }
}
fit <- stan(file = code_poisson, data = c("y", "N"), refresh = 0)
monitor(as.array(fit)[,,c("lambda","lp__")])
Inference for the input samples (4 chains: each with iter = 1000; warmup = 500):
Q5 Q50 Q95 Mean SD Rhat Bulk_ESS Tail_ESS
lambda 3.5 3.7 3.8 3.7 0.1 1 863 1060
lp__ 544.4 546.0 546.2 545.7 0.7 1 946 1365
For each parameter, Bulk_ESS and Tail_ESS are crude measures of
effective sample size for bulk and tail quantities respectively (an ESS > 100
per chain is considered good), and Rhat is the potential scale reduction
factor on rank normalized split chains (at convergence, Rhat <= 1.05).
color_scheme_set("brightblue") # check out ?bayesplot::color_scheme_set
lambda_draws <- as.matrix(fit, pars = "lambda")
mcmc_areas(lambda_draws, prob = 0.8) # color 80% interval
print(fit, pars = "lambda")
Inference for Stan model: anon_model.
4 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=4000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
lambda 3.67 0 0.08 3.51 3.61 3.67 3.72 3.83 1513 1
Samples were drawn using NUTS(diag_e) at Wed Oct 26 15:42:41 2022.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
mean(y)
[1] 3.662
The model gets the mean right, but, as we’ll see next, the model is quite bad at predicting the outcome.
Extract the y_rep draws from the fitted model object:
y_rep <- as.matrix(fit, pars = "y_rep")
# number of rows = number of post-warmup posterior draws
# number of columns = length(y)
dim(y_rep)
[1] 4000 500
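Each row of y_rep is one complete replicated dataset of length N, generated from a single posterior draw of lambda. For intuition, the same matrix could be built in R from the lambda draws extracted above (a sketch; y_rep_manual is my name):
# Each posterior draw of lambda generates one replicated dataset of length N
lambda_post <- as.vector(lambda_draws)
y_rep_manual <- t(sapply(lambda_post, function(l) rpois(N, l)))
dim(y_rep_manual)  # 4000 x 500, same shape as y_rep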
Compare the histogram of y to histograms of several y_rep:
ppc_hist(y, y_rep[1:8, ], binwidth = 1)
Compare the density estimate of y to density estimates of a bunch of y_rep:
ppc_dens_overlay(y, y_rep[1:50, ])
A rootogram shows the square root of the counts, with a histogram for the observed data and the model predictions (y_rep) as a ribbon:
ppc_rootogram(y, y_rep)
Compare the proportion of zeros in y to the distribution of that proportion over all y_rep:
prop_zero <- function(x) mean(x == 0)
print(prop_zero(y))
[1] 0.202
ppc_stat(y, y_rep, stat = "prop_zero")
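ppc_stat shows where the observed proportion of zeros falls in the predictive distribution; the corresponding posterior predictive p-value is easy to compute by hand (a sketch, not in the original demo):
# Share of replicated datasets with at least as high a proportion of zeros
# as the observed data; for the simple Poisson model this is essentially 0
mean(apply(y_rep, 1, prop_zero) >= prop_zero(y))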
Look at two statistics in a scatterplot:
ppc_stat_2d(y, y_rep, stat = c("mean", "sd"))
Distributions of predictive errors:
ppc_error_hist(y, y_rep[1:4, ], binwidth = 1) + xlim(-15, 15)
Warning: Removed 8 rows containing missing values (geom_bar).
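ppc_error_hist plots the predictive errors y - y_rep. The raw errors are easy to compute directly if you prefer to summarize them yourself (a sketch using the y_rep matrix extracted above):
# Predictive errors y - y_rep for the first four draws, one row per draw
errors <- sweep(-y_rep[1:4, ], MARGIN = 2, STATS = y, FUN = "+")
summary(as.vector(errors))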
Next we try a Poisson hurdle model to capture the excess zeros. This model says that there is some probability theta that y is zero and probability 1 - theta that y is positive. Conditional on observing a positive y, we use a truncated Poisson,
y[n] ~ poisson(lambda) T[1, U];
where T[1, U] indicates truncation with lower bound 1 and upper bound U, which for simplicity we’ll assume is max(y).
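The truncated distribution renormalizes the Poisson pmf by the probability of the interval [1, U]. A small R sketch (my addition; log_pmf_trunc_pois is my name) of the same computation the Stan program below performs for log_lik, where Stan works on the log scale with log_diff_exp for numerical stability:
# Log-pmf of a Poisson truncated to [1, U]:
# log Poisson(y | lambda) - log(P(Y <= U) - P(Y = 0))
log_pmf_trunc_pois <- function(y, lambda, U) {
  dpois(y, lambda, log = TRUE) -
    log(ppois(U, lambda) - ppois(0, lambda))
}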
code_poisson_hurdle <- root("demos_rstan/ppc", "poisson-hurdle.stan")
writeLines(readLines(code_poisson_hurdle))
/* Poisson "hurdle" model (also with upper truncation)
 *
 * Note: for simplicity I assume that there is only one way to obtain
 * a zero, as opposed to some zero-inflated models where there are
 * multiple processes that lead to a zero. So in this example, y is
 * zero with probability theta and y is modeled as a truncated poisson
 * if greater than zero.
 */
data {
  int<lower=1> N;
  array[N] int<lower=0> y;
}
transformed data {
  int U = max(y); // upper truncation point
}
parameters {
  real<lower=0, upper=1> theta; // Pr(y = 0)
  real<lower=0> lambda; // Poisson rate parameter (if y > 0)
}
model {
  lambda ~ exponential(0.2);
  for (n in 1 : N) {
    if (y[n] == 0) {
      target += log(theta); // log(Pr(y = 0))
    } else {
      target += log1m(theta); // log(Pr(y > 0))
      y[n] ~ poisson(lambda) T[1, U]; // truncated poisson
    }
  }
}
generated quantities {
  array[N] real log_lik;
  array[N] int y_rep;
  for (n in 1 : N) {
    if (bernoulli_rng(theta)) {
      y_rep[n] = 0;
    } else {
      // use a while loop because Stan doesn't have truncated RNGs
      int w; // temporary variable
      w = poisson_rng(lambda);
      while (w == 0 || w > U) {
        w = poisson_rng(lambda);
      }
      y_rep[n] = w;
    }
    if (y[n] == 0) {
      log_lik[n] = log(theta);
    } else {
      log_lik[n] = log1m(theta) + poisson_lpmf(y[n] | lambda)
                   - log_diff_exp(poisson_lcdf(U | lambda),
                                  poisson_lcdf(0 | lambda));
    }
  }
}
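The while loop in generated quantities is rejection sampling: draw from the untruncated Poisson and redraw until the value lands in [1, U]. The same idea in R, as a sketch (rpois_trunc is my name):
# Draw n values from a Poisson(lambda) truncated to [1, U] by rejection
rpois_trunc <- function(n, lambda, U) {
  draws <- integer(n)
  for (i in seq_len(n)) {
    w <- rpois(1, lambda)
    while (w == 0 || w > U) {
      w <- rpois(1, lambda) # redraw until 1 <= w <= U
    }
    draws[i] <- w
  }
  draws
}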
fit2 <- stan(file = code_poisson_hurdle, data = c("y", "N"), refresh = 0)
print(fit2, pars = c("lambda", "theta"))
Inference for Stan model: anon_model.
4 chains, each with iter=2000; warmup=1000; thin=1;
post-warmup draws per chain=1000, total post-warmup draws=4000.
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
lambda 5.3 0 0.16 5.00 5.19 5.3 5.41 5.63 3335 1
theta 0.2 0 0.02 0.17 0.19 0.2 0.22 0.24 2839 1
Samples were drawn using NUTS(diag_e) at Wed Oct 26 15:44:34 2022.
For each parameter, n_eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor on split chains (at
convergence, Rhat=1).
lambda_draws2 <- as.matrix(fit2, pars = "lambda")
lambdas <- cbind(lambda_fit1 = lambda_draws[, 1],
                 lambda_fit2 = lambda_draws2[, 1])
mcmc_areas(lambdas, prob = 0.8) # color 80% interval
Same plots as before (and a few others), but this time using y_rep from fit2. Everything looks much more reasonable:
y_rep2 <- as.matrix(fit2, pars = "y_rep")
ppc_hist(y, y_rep2[1:8, ], binwidth = 1)
ppc_dens_overlay(y, y_rep2[1:50, ])
ppc_rootogram(y, y_rep2)
ppc_stat(y, y_rep2, stat = "prop_zero")
ppc_stat_2d(y, y_rep2, stat = c("mean", "sd"))
ppc_error_hist(y, y_rep2[sample(nrow(y_rep2), 4), ], binwidth = 1)
Compare predictive performance with LOO.
(loo1 <- loo(fit))
Computed from 4000 by 500 log-likelihood matrix
Estimate SE
elpd_loo -1207.9 17.5
p_loo 1.4 0.1
looic 2415.7 35.0
------
Monte Carlo SE of elpd_loo is 0.0.
All Pareto k estimates are good (k < 0.5).
See help('pareto-k-diagnostic') for details.
(loo2 <- loo(fit2))
Computed from 4000 by 500 log-likelihood matrix
Estimate SE
elpd_loo -992.0 9.3
p_loo 2.1 0.1
looic 1984.1 18.6
------
Monte Carlo SE of elpd_loo is 0.0.
All Pareto k estimates are good (k < 0.5).
See help('pareto-k-diagnostic') for details.
loo_compare(loo1, loo2)
elpd_diff se_diff
model2 0.0 0.0
model1 -215.8 22.1
Model 2 is the clear winner in predictive performance.
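If a single summary on the weight scale is preferred, the loo package can also compute model weights by stacking; as a quick follow-up (not part of the original demo):
# Stacking weights; expect essentially all weight on the hurdle model
loo_model_weights(list(loo1, loo2))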