Load packages
library(cmdstanr)
# If running in Aalto JupyterHub the path should automatically be set to
# '/coursedata/cmdstan'
options(mc.cores = 1)
library(posterior)
options(posterior.num_args=list(sigfig=2)) # by default summaries with 2 significant digits
library(loo)
library(tidyr)
library(dplyr)
options(pillar.neg=FALSE)
library(ggplot2)
library(gridExtra)
library(bayesplot)
library(ggdist)
theme_set(bayesplot::theme_default(base_family = "sans"))
library(rprojroot)
root<-has_file(".BDA_R_demos_root")$make_fix_file()
SEED <- 48927 # set random seed for reproducability
This notebook contains several examples of how to use Stan in R with cmdstanr. This notebook assumes basic knowledge of Bayesian inference and MCMC. The Stan models are stored in separate .stan-files. The examples are related to Bayesian data analysis course.
Toy data with sequence of failures (0) and successes (1). We would like to learn about the unknown probability of success.
data_bern <- list(N = 10, y = c(1, 1, 1, 0, 1, 1, 1, 0, 1, 0))
Bernoulli model with a proper Beta(1,1) (uniform) prior
code_bern <- root("demos_rstan", "bern.stan")
writeLines(readLines(code_bern))
// Bernoulli model
data {
int<lower=0> N; // number of observations
array[N] int<lower=0, upper=1> y; // vector of binary observations
}
parameters {
real<lower=0, upper=1> theta; // probability of success
}
model {
// model block creates the log density to be sampled
theta ~ beta(1, 1); // prior
y ~ bernoulli(theta); // observation model / likelihood
// the notation using ~ is syntactic sugar for
// target += beta_lpdf(theta | 1, 1); // lpdf for continuous theta
// target += bernoulli_lpmf(y | theta); // lpmf for discrete y
// target is the log density to be sampled
//
// y is an array of integers and
// y ~ bernoulli(theta);
// is equivalent to
// for (i in 1:N) {
// y[i] ~ bernoulli(theta);
// }
// which is equivalent to
// for (i in 1:N) {
// target += bernoulli_lpmf(y[i] | theta);
// }
}
generated quantities {
vector[N] log_lik;
real lprior = beta_lpdf(theta | 1, 1);
for (i in 1:N) {
log_lik[i] = bernoulli_lpmf(y[i] | theta);
}
}
Sample form the posterior and show the summary
mod_bern <- cmdstan_model(stan_file = code_bern)
fit_bern <- mod_bern$sample(data = data_bern, seed = SEED, refresh=1000)
fit_bern$summary()
Plot a histogram of the posterior draws with bayesplot (uses ggplot)
draws <- fit_bern$draws(format = "df")
mcmc_hist(draws, pars='theta') + xlim(c(0,1))
Plot a dots plot of the posterior draws with ggplot + ggdist
draws |>
ggplot(aes(x=theta)) +
stat_dotsinterval() +
xlim(c(0,1))
Instead of sequence of 0’s and 1’s, we can summarize the data with the number of experiments and the number successes:
data_bin <- list(N = 10, y = 7)
And then we use Binomial model with Beta(1,1) prior for the probability of success.
code_binom <- root("demos_rstan","binom.stan")
writeLines(readLines(code_binom))
// Binomial model with beta(1,1) prior
data {
int<lower=0> N; // number of experiments
int<lower=0> y; // number of successes
}
parameters {
real<lower=0, upper=1> theta; // probability of success in range (0,1)
}
model {
// model block creates the log density to be sampled
theta ~ beta(1, 1); // prior
y ~ binomial(N, theta); // observation model / likelihood
// the notation using ~ is syntactic sugar for
// target += beta_lpdf(theta | 1, 1); // lpdf for continuous theta
// target += binomial_lpmf(y | N, theta); // lpmf for discrete y
// target is the log density to be sampled
}
Sample from the posterior and plot the posterior. The histogram should look similar as in the Bernoulli case.
mod_bin <- cmdstan_model(stan_file = code_binom)
fit_bin <- mod_bin$sample(data = data_bin, seed = SEED, refresh=1000)
fit_bin$summary()
draws <- fit_bin$draws(format = "df")
mcmc_hist(draws, pars = 'theta') + xlim(c(0,1))
Re-run the model with a new data. The compiled Stan program is re-used making the re-use faster.
data_bin <- list(N = 100, y = 70)
fit_bin <- mod_bin$sample(data = data_bin, seed = SEED, refresh=1000)
fit_bin$summary()
draws <- fit_bin$draws(format = "df")
mcmc_hist(draws, pars = 'theta', binwidth = 0.01) + xlim(c(0,1))
In the above examples the probability of success \(\theta\) was declared as
real<lower=0,upper=1> theta;
Stan makes automatic transformation of the variable to the unconstrained space using logit transofrmation for interval constrained and log transformation for half constraints.
The following example shows how we can also make an explicit transformation and use binomial_logit function which takes the unconstrained parameter as an argument and uses logit transformation internally. This form can be useful for better numerical stability.
code_binomb <- root("demos_rstan", "binomb.stan")
writeLines(readLines(code_binomb))
// Binomial model with a roughly uniform prior for
// the probability of success (theta)
data {
int<lower=0> N; // number of experiments
int<lower=0> y; // number of successes
}
parameters {
// sampling is done for the parameters
real alpha; // logit of probability of success in rang (-Inf,Inf)
}
transformed parameters {
// transformed parameters are deterministic transformations of parameters (and data)
real theta = inv_logit(alpha); // probability of success in range (0,1)
}
model {
// model block creates the log density to be sampled
alpha ~ normal(0, 1.5); // roughly uniform prior for theta
y ~ binomial_logit(N, alpha); // model parameterized with logit of probability
// the notation using ~ is syntactic sugar for
// target += normal_lpdf(alpha | 0, 1.5); // lpdf for continuous theta
// target += binomial_logit_lpmf(y | N, alpha); // lpmf for discrete y
// target is the log density to be sampled
}
Here we have used Gaussian prior in the unconstrained space, which produces close to uniform prior for theta.
Sample from the posterior and plot the posterior. The histogram should look similar as with the previous models.
data_bin <- list(N = 100, y = 70)
mod_binb <- cmdstan_model(stan_file = code_binomb)
fit_binb <- mod_bin$sample(data = data_bin, seed = SEED, refresh=1000)
fit_binb$summary()
draws <- fit_binb$draws(format = "df")
mcmc_hist(draws, pars = 'theta', binwidth = 0.01) + xlim(c(0,1))
An experiment was performed to estimate the effect of beta-blockers on mortality of cardiac patients. A group of patients were randomly assigned to treatment and control groups:
Data:
data_bin2 <- list(N1 = 674, y1 = 39, N2 = 680, y2 = 22)
To analyse whether the treatment is useful, we can use Binomial model for both groups and compute odds-ratio:
code_binom2 <- root("demos_rstan", "binom2.stan")
writeLines(readLines(code_binom2))
// Comparison of two groups with Binomial
data {
int<lower=0> N1; // number of experiments in group 1
int<lower=0> y1; // number of deaths in group 1
int<lower=0> N2; // number of experiments in group 2
int<lower=0> y2; // number of deaths in group 2
}
parameters {
real<lower=0, upper=1> theta1; // probability of death in group 1
real<lower=0, upper=1> theta2; // probability of death in group 2
}
model {
// model block creates the log density to be sampled
theta1 ~ beta(1, 1); // prior
theta2 ~ beta(1, 1); // prior
y1 ~ binomial(N1, theta1); // observation model / likelihood
y2 ~ binomial(N2, theta2); // observation model / likelihood
// the notation using ~ is syntactic sugar for
// target += beta_lpdf(theta1 | 1, 1); // lpdf for continuous theta1
// target += beta_lpdf(theta2 | 1, 1); // lpdf for continuous theta2
// target += binomial_lpmf(y1 | N1, theta1); // lpmf for discrete y1
// target += binomial_lpmf(y2 | N2, theta2); // lpmf for discrete y2
// target is the log density to be sampled
}
generated quantities {
// generated quantities are computed after sampling
real oddsratio = (theta2 / (1 - theta2)) / (theta1 / (1 - theta1));
}
Sample from the posterior and plot the posterior
mod_bin2 <- cmdstan_model(stan_file = code_binom2)
fit_bin2 <- mod_bin2$sample(data = data_bin2, seed = SEED, refresh=1000)
fit_bin2$summary()
Histogram
draws <- fit_bin2$draws(format = "df")
mcmc_hist(draws, pars = 'oddsratio') +
geom_vline(xintercept = 1) +
scale_x_continuous(breaks = c(seq(0.25,1.5,by=0.25)))
Dots plot with median and 66% and 95% intervals
draws |>
ggplot(aes(x=oddsratio)) +
geom_dotsinterval() +
geom_vline(xintercept = 1) +
scale_x_continuous(breaks = c(seq(0.25,1.5,by=0.25)))+
labs(x='Odds ratio', y='')
Probability (and corresponding MCSE) that oddsratio<1
draws |>
mutate_variables(p_oddsratio_lt_1 = as.numeric(oddsratio<1)) |>
subset_draws("p_oddsratio_lt_1") |>
summarise_draws(prob=mean, MCSE=mcse_mean)
# A tibble: 1 × 3
variable prob MCSE
<chr> <dbl> <dbl>
1 p_oddsratio_lt_1 0.99 0.0020
The following file has Kilpisjärvi summer month temperatures 1952-2022 (data by Finnish Meteorological Institute, CC-BY 4.0).
data_kilpis <- read.delim(root("demos_rstan","kilpisjarvi-summer-temp-2022.csv"), sep = ";")
data_lin <-list(N = nrow(data_kilpis),
x = data_kilpis$year,
xpred = 2016,
y = data_kilpis[,5])
Plot the data
ggplot() +
geom_point(aes(x, y), data = data.frame(data_lin), size = 1) +
labs(y = 'Summer temp. @Kilpisjärvi', x= "Year") +
guides(linetype = "none")
To analyse whether the average summer month temperature is rising, we use a linear model with Gaussian model for the unexplained variation.
The folloing Stan code allows also setting hyperparameter values as data allowing easier way to use different priors in different analyses:
code_lin <- root("demos_rstan", "lin.stan")
writeLines(readLines(code_lin))
// Gaussian linear model with adjustable priors
data {
int<lower=0> N; // number of data points
vector[N] x; // covariate / predictor
vector[N] y; // target
real xpred; // new covariate value to make predictions
real pmualpha; // prior mean for alpha
real psalpha; // prior std for alpha
real pmubeta; // prior mean for beta
real psbeta; // prior std for beta
real pssigma; // prior std for half-normal prior for sigma
}
parameters {
real alpha; // intercept
real beta; // slope
real<lower=0> sigma; // standard deviation is constrained to be positive
}
transformed parameters {
// deterministic transformation of parameters and data
vector[N] mu = alpha + beta * x; // linear model
}
model {
alpha ~ normal(pmualpha, psalpha); // prior
beta ~ normal(pmubeta, psbeta); // prior
sigma ~ normal(0, pssigma); // as sigma is constrained to be positive,
// this is same as half-normal prior
y ~ normal(mu, sigma); // observation model / likelihood
// the notation using ~ is syntactic sugar for
// target += normal_lpdf(alpha | pmualpha, psalpha);
// target += normal_lpdf(beta | pmubeta, psbeta);
// target += normal_lpdf(y | mu, sigma);
// target is the log density to be sampled
}
generated quantities {
// sample from the predictive distribution
real ypred = normal_rng(alpha + beta * xpred, sigma);
// compute log predictive densities to be used for LOO-CV
vector[N] log_lik;
for (i in 1 : N) {
log_lik[i] = normal_lpdf(y[i] | mu[i], sigma);
}
}
Create another list with data and priors
data_lin_priors <- c(list(
pmualpha = mean(unlist(data_kilpis[,5])), # centered
psalpha = 100, # weakly informative
pmubeta = 0, # a priori incr. and decr. as likely
psbeta = (.1--.1)/6, # avg temp prob does does not incr. more than a degree per 10 years
pssigma = 1), # total variation in summer average temperatures is less +-3 degrees
data_lin)
Run Stan
mod_lin <- cmdstan_model(stan_file = code_lin)
fit_lin <- mod_lin$sample(data = data_lin_priors, seed = SEED, refresh=1000)
Stan gives a warning: There were X transitions after warmup that exceeded the maximum treedepth.
We can check the generic convergence diagnostics as follows
fit_lin$summary()
# A tibble: 147 × 10
variable mean median sd mad q5 q95 rhat ess_bulk
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 lp__ -41. -41. 1.2 1.1 -43. -40. 1.0 1171.
2 alpha -32. -32. 12. 12. -52. -13. 1.0 959.
3 beta 0.021 0.021 0.0060 0.0062 0.011 0.031 1.0 959.
4 sigma 1.1 1.1 0.092 0.091 0.94 1.2 1.0 1239.
5 mu[1] 8.7 8.7 0.25 0.25 8.3 9.1 1.0 1133.
6 mu[2] 8.7 8.7 0.24 0.24 8.3 9.1 1.0 1143.
7 mu[3] 8.7 8.7 0.24 0.24 8.3 9.1 1.0 1154.
8 mu[4] 8.8 8.8 0.23 0.23 8.4 9.1 1.0 1167.
9 mu[5] 8.8 8.8 0.23 0.23 8.4 9.1 1.0 1183.
10 mu[6] 8.8 8.8 0.22 0.22 8.4 9.2 1.0 1200.
# ℹ 137 more rows
# ℹ 1 more variable: ess_tail <dbl>
We can check the HMC specific diagnostics as follows
fit_lin$diagnostic_summary(diagnostics = c("divergences", "treedepth"))
Warning: 5 of 4000 (0.0%) transitions hit the maximum treedepth limit of 10.
See https://mc-stan.org/misc/warnings for details.
$num_divergent
[1] 0 0 0 0
$num_max_treedepth
[1] 0 0 3 2
The high number of max treedepth exceedence is due to high posterior dependency, which in this case reduces efficiency, but doesn’t invalidate the results
draws_lin <- fit_lin$draws(format = "df")
draws_lin |>
mcmc_scatter(pars=c("alpha","beta"))
Compute the probability that the summer temperature is increasing.
mean(draws_lin[,"beta"]>0) # probability that beta > 0
[1] 1
Plot the data, the model fit and prediction for year 2016.
mu <- draws_lin |>
as_draws_df() |>
as_tibble() |>
select(starts_with("mu")) |>
apply(2, quantile, c(0.05, 0.5, 0.95)) |>
t() %>%
data.frame(x = data_lin$x, .) |>
gather(pct, y, -x)
pfit <- ggplot() +
geom_point(aes(x, y), data = data.frame(data_lin), size = 1) +
geom_line(aes(x, y, linetype = pct), data = mu, color = 'red') +
scale_linetype_manual(values = c(2,1,2)) +
labs(y = 'Summer temp. @Kilpisjärvi', x= "Year") +
guides(linetype = "none")
phist <- mcmc_hist(draws_lin, pars = c('beta','sigma','ypred'))
grid.arrange(pfit, phist, nrow = 2)
In the above we used the unnormalized data and as x values are far away from zero, this will cause very strong posterior dependency between alpha and beta (did you use ShinyStan for the above model?). The strong posterior dependency can be removed by normalizing the data to have zero mean. The following Stan code makes it in Stan. In generated quantities we do correspnding transformation back to the original scale.
code_lin_std <- root("demos_rstan", "lin_std.stan")
writeLines(readLines(code_lin_std))
// Gaussian linear model with standardized data
data {
int<lower=0> N; // number of data points
vector[N] x; // covariate / predictor
vector[N] y; // target
real xpred; // new covariate value to make predictions
}
transformed data {
// deterministic transformations of data
vector[N] x_std = (x - mean(x)) / sd(x);
vector[N] y_std = (y - mean(y)) / sd(y);
real xpred_std = (xpred - mean(x)) / sd(x);
}
parameters {
real alpha; // intercept
real beta; // slope
real<lower=0> sigma_std; // standard deviation is constrained to be positive
}
transformed parameters {
// deterministic transformation of parameters and data
vector[N] mu_std = alpha + beta * x_std; // linear model
}
model {
alpha ~ normal(0, 1); // weakly informative prior (given standardized data)
beta ~ normal(0, 1); // weakly informative prior (given standardized data)
sigma_std ~ normal(0, 1); // weakly informative prior (given standardized data)
y_std ~ normal(mu_std, sigma_std); // observation model / likelihood
}
generated quantities {
// transform to the original data scale
vector[N] mu = mu_std * sd(y) + mean(y);
real<lower=0> sigma = sigma_std * sd(y);
// sample from the predictive distribution
real ypred = normal_rng((alpha + beta * xpred_std) * sd(y) + mean(y),
sigma_std * sd(y));
// compute log predictive densities to be used for LOO-CV
// to make appropriate comparison to other models, this log density is computed
// using the original data scale (y, mu, sigma)
vector[N] log_lik;
for (i in 1 : N) {
log_lik[i] = normal_lpdf(y[i] | mu[i], sigma);
}
}
mod_lin_std <- cmdstan_model(stan_file = code_lin_std)
fit_lin_std <- mod_lin_std$sample(data = data_lin, seed = SEED, refresh=1000)
Now there were no warnings. We can check diagnostics with the following commands.
fit_lin_std$summary()
# A tibble: 219 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 lp__ -31. -3.1e+1 1.3 1.0 -34. -30. 1.0 1893. 2281.
2 alpha 0.0012 8.3e-4 0.11 0.11 -0.18 0.18 1.0 3872. 2616.
3 beta 0.39 3.9e-1 0.11 0.11 0.21 0.57 1.0 3770. 2396.
4 sigma_std 0.93 9.3e-1 0.081 0.080 0.81 1.1 1.0 3544. 2711.
5 mu_std[1] -0.66 -6.6e-1 0.22 0.21 -1.0 -0.31 1.0 3642. 2228.
6 mu_std[2] -0.64 -6.4e-1 0.21 0.21 -0.99 -0.30 1.0 3645. 2229.
7 mu_std[3] -0.62 -6.3e-1 0.21 0.20 -0.97 -0.29 1.0 3645. 2249.
8 mu_std[4] -0.60 -6.1e-1 0.20 0.20 -0.94 -0.27 1.0 3644. 2263.
9 mu_std[5] -0.59 -5.9e-1 0.20 0.19 -0.91 -0.26 1.0 3650. 2263.
10 mu_std[6] -0.57 -5.7e-1 0.19 0.19 -0.89 -0.25 1.0 3650. 2333.
# ℹ 209 more rows
fit_lin_std$diagnostic_summary(diagnostics = c("divergences", "treedepth"))
$num_divergent
[1] 0 0 0 0
$num_max_treedepth
[1] 0 0 0 0
We see that there are no warnings by diagnostics and ESS’s are higher than with the previous case with non-standardized data. The posterior has no high dependency.
draws_lin_std <- fit_lin_std$draws(format = "df")
draws_lin_std |>
mcmc_scatter(pars=c("alpha","beta"))
Next we check that we get similar probability for beta>0.
draws_lin_std <- fit_lin_std$draws(format = "df")
mean(draws_lin_std[,"beta"]>0) # probability that beta > 0
[1] 0.9995
The temperatures used in the above analyses are averages over three months, which makes it more likely that they are normally distributed, but there can be extreme events in the feather and we can check whether more robust Student’s \(t\) observation model woul give different results.
code_lin_std_t <- root("demos_rstan", "lin_std_t.stan")
writeLines(readLines(code_lin_std_t))
// Linear student-t model
data {
int<lower=0> N; // number of data points
vector[N] x; // covariate / predictor
vector[N] y; // target
real xpred; // new covariate value to make predictions
}
transformed data {
// deterministic transformations of data
vector[N] x_std = (x - mean(x)) / sd(x);
vector[N] y_std = (y - mean(y)) / sd(y);
real xpred_std = (xpred - mean(x)) / sd(x);
}
parameters {
real alpha; // intercept
real beta; // slope
real<lower=0> sigma_std; // standard deviation is constrained to be positive
real<lower=1> nu; // degrees of freedom is constrained >1
}
transformed parameters {
// deterministic transformation of parameters and data
vector[N] mu_std = alpha + beta * x_std; // linear model
}
model {
alpha ~ normal(0, 1); // weakly informative prior (given standardized data)
beta ~ normal(0, 1); // weakly informative prior (given standardized data)
sigma_std ~ normal(0, 1); // weakly informative prior (given standardized data)
nu ~ gamma(2, 0.1); // Juárez and Steel(2010)
y_std ~ student_t(nu, mu_std, sigma_std); // observation model / likelihood
}
generated quantities {
// transform to the original data scale
vector[N] mu = mu_std * sd(y) + mean(y);
real<lower=0> sigma = sigma_std * sd(y);
// sample from the predictive distribution
real ypred = student_t_rng(nu,
(alpha + beta * xpred_std) * sd(y) + mean(y),
sigma_std * sd(y));
// compute log predictive densities to be used for LOO-CV
// to make appropriate comparison to other models, this log density is computed
// using the original data scale (y, mu, sigma)
vector[N] log_lik;
for (i in 1 : N) {
log_lik[i] = student_t_lpdf(y[i] | nu, mu[i], sigma);
}
}
mod_lin_std_t <- cmdstan_model(stan_file = code_lin_std_t)
fit_lin_std_t <- mod_lin_std_t$sample(data = data_lin, seed = SEED, refresh=1000)
We get some warnings, but these specific warnings are not critical if counts are small as here.
Let’s examine further diagnostics.
fit_lin_std_t$summary()
# A tibble: 220 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 lp__ -5.3e+1 -5.2e+1 1.5 1.2 -55. -51. 1.0 1718. 2500.
2 alpha 3.6e-3 4.2e-3 0.11 0.11 -0.18 0.18 1.0 4000. 2694.
3 beta 4.0e-1 4.1e-1 0.11 0.11 0.22 0.59 1.0 4011. 2771.
4 sigma_std 8.9e-1 8.9e-1 0.088 0.087 0.76 1.0 1.0 3339. 2398.
5 nu 2.4e+1 2.1e+1 14. 13. 7.2 52. 1.0 3411. 2485.
6 mu_std[1] -6.8e-1 -6.9e-1 0.22 0.22 -1.0 -0.32 1.0 3683. 2656.
7 mu_std[2] -6.6e-1 -6.7e-1 0.22 0.22 -1.0 -0.31 1.0 3680. 2637.
8 mu_std[3] -6.4e-1 -6.5e-1 0.22 0.21 -0.99 -0.29 1.0 3673. 2619.
9 mu_std[4] -6.2e-1 -6.3e-1 0.21 0.21 -0.97 -0.28 1.0 3705. 2619.
10 mu_std[5] -6.0e-1 -6.1e-1 0.21 0.21 -0.94 -0.27 1.0 3694. 2693.
# ℹ 210 more rows
fit_lin_std_t$diagnostic_summary(diagnostics = c("divergences", "treedepth"))
$num_divergent
[1] 0 0 0 0
$num_max_treedepth
[1] 0 0 0 0
We get similar diagnostics as for the linear Gaussian model with non-standardised data.
Compute the probability that the summer temperature is increasing.
draws_lin_std_t <- fit_lin_std_t$draws(format = "df")
mean(draws_lin_std_t[,"beta"]>0) # probability that beta > 0
[1] 0.99925
We get similar probability as with Gaussian obervation model.
Plot data and the model fit
mu <- draws_lin_std_t |>
as_tibble() |>
select(starts_with("mu[")) |>
apply(2, quantile, c(0.05, 0.5, 0.95)) |>
t() %>%
data.frame(x = data_lin$x, .) |>
gather(pct, y, -x)
pfit <- ggplot() +
geom_point(aes(x, y), data = data.frame(data_lin), size = 1) +
geom_line(aes(x, y, linetype = pct), data = mu, color = 'red') +
scale_linetype_manual(values = c(2,1,2)) +
labs(y = 'Summer temp. @Kilpisjärvi', x= "Year") +
guides(linetype = "none")
phist <- mcmc_hist(draws_lin_std_t, pars = c('beta','sigma','ypred'))
grid.arrange(pfit, phist, nrow = 2)
We see also that the marginal posterior of nu is wide with lot of mass for values producing distrbution really close to Gaussian.
We can use leave-one-out cross-validation to compare the expected predictive performance. For the following lines to work, the log-likelihood needs to be evaluated in the stan code. For an example, see lin.stan and Computing approximate leave-one-out cross-validation usig PSIS-LOO.
# At this moment there is not yet method for cmdstanr fit, so we use this helper
loo_lin_std <- fit_lin_std$loo()
loo_lin_std_t <- fit_lin_std_t$loo()
loo_compare(loo_lin_std, loo_lin_std_t)
elpd_diff se_diff
model1 0.0 0.0
model2 -0.4 0.4
There is no practical difference between Gaussian and Student’s \(t\) observation model for this data.
Let’s compare the temperatures in three summer months.
data_grp <-list(N = 3*nrow(data_kilpis[,]),
K = 3,
x = rep(1:3, nrow(data_kilpis[,])),
y = c(t(data_kilpis[,2:4])))
code_grp_aov <- root("demos_rstan", "grp_aov.stan")
writeLines(readLines(code_grp_aov))
// Comparison of k groups with common variance (ANOVA)
data {
int<lower=0> N; // number of observations
int<lower=0> K; // number of groups
array[N] int<lower=1, upper=K> x; // discrete group indicators
vector[N] y; // real valued observations
}
parameters {
vector[K] mu; // group means
real<lower=0> sigma; // common standard deviation constrained to be positive
}
model {
mu ~ normal(0, 100); // weakly informative prior
sigma ~ normal(0, 1); // weakly informative prior
y ~ normal(mu[x], sigma); // observation model / likelihood
}
Fit the model
mod_grp <- cmdstan_model(stan_file = code_grp_aov)
fit_grp <- mod_grp$sample(data = data_grp, seed = SEED, refresh=1000)
fit_grp$summary()
fit_grp$diagnostic_summary(diagnostics = c("divergences", "treedepth"))
Results do not differ much from the previous, because there is only few groups and quite much data per group, but this works as an example of a hierarchical model.
code_grp_prior_mean <- root("demos_rstan", "grp_prior_mean.stan")
writeLines(readLines(code_grp_prior_mean))
// Comparison of k groups with common variance and
// hierarchical prior for the mean
data {
int<lower=0> N; // number of observations
int<lower=0> K; // number of groups
array[N] int<lower=1, upper=K> x; // discrete group indicators
vector[N] y; // real valued observations
}
parameters {
real mu0; // prior mean
real<lower=0> sigma0; // prior std constrained to be positive
vector[K] mu; // group means
real<lower=0> sigma; // common std constrained to be positive
}
model {
mu0 ~ normal(10, 10); // weakly informative prior
sigma0 ~ normal(0, 10); // weakly informative prior
mu ~ normal(mu0, sigma0); // population prior with unknown parameters
// log-normal prior sets normal prior on logarithm of the paremeter,
// which is useful for positive parameters that shouldn't be very
// close to 0. BDA3 Chapter 5 uses scaled inverse Chi^2 prior, but
// as these don't need to be (semi-)conjugate, thinking in terms of
// log-normal can be easier.
sigma ~ lognormal(0, .5); // weakly informative prior
y ~ normal(mu[x], sigma); // observation model / likelihood
}
Fit the model
mod_grp <- cmdstan_model(stan_file = code_grp_prior_mean)
fit_grp <- mod_grp$sample(data = data_grp, seed = SEED, refresh=1000)
fit_grp$summary()
fit_grp$diagnostic_summary(diagnostics = c("divergences", "treedepth"))
We got a small number of divergences, so we increase adapt_delta=0.95
. Note that thisis useful only in case of small number of divergences, and increasing adapt_delta>0.99
doesn’t usually make sense, and in such cases there is need to investigate the identifiability or parameter transformations.
mod_grp <- cmdstan_model(stan_file = code_grp_prior_mean)
fit_grp <- mod_grp$sample(data = data_grp, seed = SEED, refresh=1000, adapt_delta=0.999)
fit_grp$summary()
fit_grp$diagnostic_summary(diagnostics = c("divergences", "treedepth"))
code_grp_prior_mean_var <- root("demos_rstan", "grp_prior_mean_var.stan")
writeLines(readLines(code_grp_prior_mean_var))
// Comparison of k groups with unequal variance and
// hierarchical priors for the mean and the variance
data {
int<lower=0> N; // number of data points
int<lower=0> K; // number of groups
array[N] int<lower=1, upper=K> x; // group indicator
vector[N] y; //
}
parameters {
real mu0; // prior mean
real<lower=0> musigma0; // prior std
vector[K] mu; // group means
real lsigma0; // prior mean
real<lower=0> lsigma0s; // prior std
vector<lower=0>[K] sigma; // group stds
}
model {
mu0 ~ normal(10, 10); // weakly informative prior
musigma0 ~ normal(0, 10); // weakly informative prior
mu ~ normal(mu0, musigma0); // population prior with unknown parameters
// lsigma0 is prior mean in log scale
lsigma0 ~ normal(0, 1); // weakly informative prior
// log-normal prior sets normal prior on logarithm of the paremeter,
// which is useful for positive parameters that shouldn't be very
// close to 0. BDA3 Chapter 5 uses scaled inverse Chi^2 prior, but
// as these don't need to be (semi-)conjugate thinking in terms of
// log-normal can be easier.
lsigma0s ~ lognormal(log(0.1), .5); // weakly informative prior
sigma ~ lognormal(lsigma0, lsigma0s); // population prior with unknown parameters
y ~ normal(mu[x], sigma[x]); // observation model / likelihood
}
Fit the model
mod_grp <- cmdstan_model(stan_file = code_grp_prior_mean_var)
fit_grp <- mod_grp$sample(data = data_grp, seed = SEED, refresh=1000)
fit_grp$summary()
fit_grp$diagnostic_summary(diagnostics = c("divergences", "treedepth"))
We got a small number of divergences, so we increase adapt_delta=0.95
. Note that thisis useful only in case of small number of divergences, and increasing adapt_delta>0.99
doesn’t usually make sense, and in such cases there is need to investigate the identifiability or parameter transformations.
mod_grp <- cmdstan_model(stan_file = code_grp_prior_mean_var)
fit_grp <- mod_grp$sample(data = data_grp, seed = SEED, refresh=1000, adapt_delta=0.95);
fit_grp$summary()
fit_grp$diagnostic_summary(diagnostics = c("divergences", "treedepth"))
Plot the results
temps <- fit_grp$draws(format = "df") |>
as_tibble() |>
select(starts_with("mu[")) |>
setNames(c('June','July','August'))
mcmc_areas(temps) + xlab('Temperature')
Probabilities that June is hotter than July, June is hotter than August and July is hotter than August:
paste('p(TempJun > TempJul) = ', mean(temps$June > temps$July))
[1] "p(TempJun > TempJul) = 0"
paste('p(TempJun > TempAug) = ', mean(temps$June > temps$August))
[1] "p(TempJun > TempAug) = 0"
paste('p(TempJul > TempAug) = ', mean(temps$July > temps$August))
[1] "p(TempJul > TempAug) = 1"