Sampling problems with latent variables: No vehicles in the park

Author

Andrew Gelman and Aki Vehtari

Published

2025-09-16

Modified

2026-04-07

This notebook includes the code for the Bayesian Workflow book Chapter 29, Sampling problems with latent variables: No vehicles in the park.

1 Introduction

It can be hard to sample from the posterior of even a simple multilevel model. We demonstrate with an example that began with a post by Luu (2024), which pointed to an online quiz by Turner (2024), adapted from Hart (1958); see also Schlag (1999).

Load packages

library(rprojroot)
root <- has_file(".Bayesian-Workflow-root")$make_fix_file()
library(arm)
library(posterior)
options(posterior.num_args = list(digits = 2), digits = 2, width = 90)
library(lme4)
library(cmdstanr)
options(mc.cores = 4)
# Setting the CmdStanR output directory makes the Quarto cache work
dir.create(root("park_rule", "stan_output"), showWarnings = FALSE)
options(cmdstanr_output_dir = root("park_rule", "stan_output"))
library(ggplot2)
library(bayesplot)
theme_set(bayesplot::theme_default(base_family = "sans", base_size = 14))
library(ggdist)
library(patchwork)
library(tictoc)
mytoc <- \() {
  toc(func.toc = \(tic, toc, msg) {
    sprintf("%s took %s sec", msg, as.character(signif(toc - tic, 2)))
  })}

print_stan_code <- function(code) {
  if (isTRUE(getOption("knitr.in.progress")) &&
        identical(knitr::opts_current$get("results"), "asis")) {
    # In render: emit as-is so Pandoc/Quarto does syntax highlighting
    block <- paste0("```stan", "\n", paste(code, collapse = "\n"), "\n", "```")
    knitr::asis_output(block)
  } else {
    writeLines(code)
  }
}

2 Data

The data include 51953 responses from 2409 people giving yes/no answers to 27 questions about the no-vehicles-in-the-park rule. Each row of the data file includes indexes for the respondent and the question, indicators for whether the person described in the survey question has a male-sounding or white-sounding name, and the outcome: 1 for the response "Yes, this violates the rule" and 0 for the response "No, this does not violate the rule."

park <- read.csv(root("park_rule", "data", "park.csv"))
N <- nrow(park)
respondents <- sort(unique(park$submission_id))
J <- length(respondents)
respondent <- rep(NA, N)
for (j in 1:J) {
  respondent[park$submission_id == respondents[j]] <- j
}
items <- sort(unique(park$question_id))
K <- length(items)
item <- park$question_id
y <- park$answer
n_responses <- rep(NA, J)
for (j in 1:J) {
  n_responses[j] <- sum(respondent == j)
}
male_name <- park$male
white_name <- park$white

Add the number of answered questions as a predictor.

n_responses_full <- n_responses[respondent]

3 lme4 model

lme4 performs fast fitting of simple multilevel models using approximate marginal maximum likelihood for the variance parameters. We start with lme4 for a quick analysis to get some insight into the data.

data_park <- data.frame(y,
                        respondent,
                        item,
                        male_name,
                        white_name,
                        n_responses_full)
tic("lme4 fit 1")
fit_lme4 <- glmer(
  y ~ (1 | item) + (1 | respondent) + male_name + white_name + n_responses_full,
  family = binomial(link = "logit"),
  data = data_park
)
mytoc()
lme4 fit 1 took 14 sec
display(fit_lme4)
glmer(formula = y ~ (1 | item) + (1 | respondent) + male_name + 
    white_name + n_responses_full, data = data_park, family = binomial(link = "logit"))
                 coef.est coef.se
(Intercept)      -1.50     0.47  
male_name         0.07     0.03  
white_name        0.08     0.03  
n_responses_full -0.03     0.01  

Error terms:
 Groups     Name        Std.Dev.
 respondent (Intercept) 1.92    
 item       (Intercept) 2.31    
 Residual               1.00    
---
number of obs: 51953, groups: respondent, 2409; item, 27
AIC = 32517.4, DIC = 19437
deviance = 25971.2 

3.1 Refit the model

Code the last predictor in terms of the number of items skipped so that zero is a reasonable baseline.

n_skipped <- K - n_responses
n_skipped_full <- n_skipped[respondent]
data_park <- data.frame(y,
                        respondent,
                        item,
                        male_name,
                        white_name,
                        n_skipped_full)

tic("lme4 fit 2")
fit_lme4 <- glmer(
  y ~ (1 | item) + (1 | respondent) + male_name + white_name + n_skipped_full,
  family = binomial(link = "logit"),
  data = data_park
)
mytoc()
lme4 fit 2 took 12 sec
display(fit_lme4)
glmer(formula = y ~ (1 | item) + (1 | respondent) + male_name + 
    white_name + n_skipped_full, data = data_park, family = binomial(link = "logit"))
               coef.est coef.se
(Intercept)    -2.42     0.45  
male_name       0.07     0.03  
white_name      0.08     0.03  
n_skipped_full  0.03     0.01  

Error terms:
 Groups     Name        Std.Dev.
 respondent (Intercept) 1.92    
 item       (Intercept) 2.31    
 Residual               1.00    
---
number of obs: 51953, groups: respondent, 2409; item, 27
AIC = 32517.4, DIC = 19437
deviance = 25971.2 
mean(y)
[1] 0.24
mean(y[male_name == 0 & white_name == 0 & n_skipped_full == 0])
[1] 0.22
sum(male_name == 0 & white_name == 0 & n_skipped_full == 0)
[1] 10748
invlogit(-2.42 + 0.07*mean(male_name) + 0.08*mean(white_name) + 0.03*mean(n_skipped_full))
[1] 0.093
mean(predict(fit_lme4, type = "response"))
[1] 0.24

3.2 Simulate data and refit

Check with simulated data that inference works.

set.seed(123)
a_respondent_sim <- rnorm(J, 0, sqrt(VarCorr(fit_lme4)$respondent))
a_item_sim <- rnorm(K, 0, sqrt(VarCorr(fit_lme4)$item))
b_sim <- fixef(fit_lme4)
X <- cbind(1, male_name, white_name, n_skipped_full)
p_sim <- invlogit(a_respondent_sim[respondent] + a_item_sim[item] + X %*% b_sim)
y_sim <- rbinom(N, 1, p_sim)
data_sim <- data.frame(data_park, y_sim)

tic("lme4 fit sim")
fit_lme4_sim <- glmer(
  y_sim ~ (1 | item) + (1 | respondent) + male_name + white_name + n_responses_full,
  family = binomial(link = "logit"),
  data = data_sim
)
mytoc()
lme4 fit sim took 12 sec
display(fit_lme4_sim)
glmer(formula = y_sim ~ (1 | item) + (1 | respondent) + male_name + 
    white_name + n_responses_full, data = data_sim, family = binomial(link = "logit"))
                 coef.est coef.se
(Intercept)      -1.88     0.53  
male_name         0.06     0.03  
white_name        0.12     0.03  
n_responses_full -0.04     0.01  

Error terms:
 Groups     Name        Std.Dev.
 respondent (Intercept) 1.87    
 item       (Intercept) 2.65    
 Residual               1.00    
---
number of obs: 51953, groups: respondent, 2409; item, 27
AIC = 29245.4, DIC = 16807
deviance = 23020.0 
a_item_hat <- ranef(fit_lme4)$item
print(a_item_hat, digits = 1)
   (Intercept)
1         7.61
2         3.01
3        -0.74
4        -1.02
5         1.15
6        -0.97
7        -0.51
8         1.99
9         3.23
10        1.34
11       -1.25
12       -3.17
13        0.49
14       -1.24
15       -0.59
16        0.61
17       -2.78
18       -1.59
19        0.02
20       -2.79
21        1.04
22        0.30
23       -0.05
24       -1.49
25       -0.04
26       -0.50
27        2.01

3.3 Plots

wordings <- read.csv(root("park_rule", "data", "park.txt"), header = FALSE)$V2
wordings <- substr(wordings, 2, nchar(wordings) - 1)
a_item_hat <- unlist(a_item_hat)
names(a_item_hat) <- wordings
print(sort(a_item_hat), digits=1)
              kite     paper_airplane                iss             toycar 
             -3.17              -2.79              -2.78              -1.59 
        ice_skates            toyboat          parachute              plane 
             -1.49              -1.25              -1.24              -1.02 
         surfboard             skates skateboard_carried         wheelchair 
             -0.97              -0.74              -0.59              -0.51 
          stroller            travois               sled              rccar 
             -0.50              -0.05              -0.04               0.02 
             horse         quadcopter         skateboard         wagon_kids 
              0.30               0.49               0.61               1.04 
             wagon            rowboat               bike           memorial 
              1.15               1.34               1.99               2.01 
         ambulance             police                car 
              3.01               3.23               7.61 
item_avg <- rep(NA, K)
for (k in 1:K) {
  item_avg[k] <- mean(y[item==k])
}
plot(item_avg, a_item_hat, pch=20)
Figure 1
plot(logit(item_avg), a_item_hat, type = "n")
text(logit(item_avg), a_item_hat, names(a_item_hat), cex = .5)
Figure 2
a_respondent_hat <- unlist(ranef(fit_lme4)$respondent)
respondent_avg <- rep(NA, J)
for (j in 1:J) {
  respondent_avg[j] <- mean(y[respondent == j])
}
plot(respondent_avg, a_respondent_hat, pch = 20, cex = .4)
Figure 3

4 Stan models

In general we prefer full Bayesian inference, even if it may take more time.

4.1 Stan data

CmdStanR needs the data in list format.

X <- cbind(male_name, white_name, n_skipped_full)
stan_data <- list(
  N = N,
  J = J,
  K = K,
  L = ncol(X),
  y = y,
  respondent = respondent,
  item = item,
  X = X
)

4.2 Hierarchical logistic regression with non-centered parameterization

Hierarchical models often benefit from a non-centered parameterization, so we start with that. We use the <multiplier=...> declaration to implement the non-centered parameterization. In the model block, we use bernoulli_logit_glm(), which is more efficient than bernoulli_logit() and can be used when the latent model can be written as a linear model. We use weak priors for the coefficients and for the population scales of the varying effects.

park_1 <- cmdstan_model(root("park_rule", "park_1.stan"))
print_stan_code(park_1$code())
data {
  int<lower=0> N, J, K, L;
  array[N] int<lower=0, upper=1> y;
  array[N] int<lower=1, upper=J> respondent;
  array[N] int<lower=1, upper=K> item;
  matrix[N, L] X;
}
parameters {
  real a;
  vector[L] b;
  real<lower=0> sigma_respondent, sigma_item;
  vector<multiplier=sigma_respondent>[J] a_respondent;
  vector<multiplier=sigma_item>[K] a_item;
}
model {
  a_respondent ~ normal(0, sigma_respondent);
  a_item ~ normal(0, sigma_item);
  b ~ normal(0, 1);
  {sigma_respondent, sigma_item} ~ normal(0, 3);
  y ~ bernoulli_logit_glm(X, a + a_respondent[respondent] + a_item[item], b);
}

When using the default sampling options and working interactively, we quickly see very slow sampling. To investigate, we switch to using one fifth of the default number of iterations, and sampling then takes about five minutes. Furthermore, we use the option init = 0.1 to initialize the unconstrained parameters with random uniform values from the range [-0.1, 0.1], which is often a better initialization for varying effects than the default range [-2, 2].

tic("Stan sampling model 1")
fit_1 <- park_1$sample(data = stan_data, init = 0.1,
                       iter_warmup = 200, iter_sampling = 200)
mytoc()
Stan sampling model 1 took 290 sec
print(fit_1)
         variable     mean   median    sd   mad       q5      q95 rhat ess_bulk ess_tail
 lp__             -1.5e+04 -1.5e+04 46.96 45.25 -1.5e+04 -1.5e+04 1.05       93      252
 a                -2.4e+00 -2.4e+00  0.46  0.46 -3.1e+00 -1.5e+00 1.29       11       30
 b[1]              7.0e-02  7.0e-02  0.03  0.03  2.0e-02  1.2e-01 1.00     1608      707
 b[2]              8.0e-02  8.0e-02  0.03  0.03  3.0e-02  1.4e-01 1.01     1874      613
 b[3]              3.0e-02  3.0e-02  0.01  0.01  2.0e-02  4.0e-02 1.01      336      425
 sigma_respondent  1.9e+00  1.9e+00  0.04  0.04  1.9e+00  2.0e+00 1.02      239      529
 sigma_item        2.4e+00  2.4e+00  0.37  0.34  1.9e+00  3.1e+00 1.02      127      191
 a_respondent[1]  -2.9e-01 -3.1e-01  1.69  1.83 -3.1e+00  2.4e+00 1.01     2174      621
 a_respondent[2]  -4.0e-01 -3.9e-01  0.70  0.69 -1.7e+00  6.8e-01 1.02     1145      458
 a_respondent[3]   1.4e+00  1.4e+00  0.48  0.49  5.9e-01  2.1e+00 1.00     1436      522

 # showing 10 of 2443 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)

We see some suspiciously high \widehat{R} values and low ESSs. We didn’t get divergence warnings, so the reason is unlikely to be a funnel-shaped posterior. We also didn’t get maximum-treedepth warnings, so there are no obviously strong posterior correlations. We can further examine the sampler diagnostics:

fit_1$sampler_diagnostics() |> as_draws_rvars()
# A draws_rvars: 200 iterations, 4 chains, and 6 variables
$treedepth__: rvar<200,4>[1] mean ± sd:
[1] 6.8 ± 0.4 

$divergent__: rvar<200,4>[1] mean ± sd:
[1] 0 ± 0 

$energy__: rvar<200,4>[1] mean ± sd:
[1] 16306 ± 57 

$accept_stat__: rvar<200,4>[1] mean ± sd:
[1] 0.93 ± 0.085 

$stepsize__: rvar<200,4>[1] mean ± sd:
[1] 0.044 ± 0.007 

$n_leapfrog__: rvar<200,4>[1] mean ± sd:
[1] 118 ± 22 

We are specifically interested in how efficient each Hamiltonian Monte Carlo iteration is. This can be measured by the number of leapfrog steps n_leapfrog__, which is close to the number of log density and gradient evaluations. Instead of examining n_leapfrog__ directly, it is common to examine treedepth__, as it scales logarithmically with n_leapfrog__; more specifically, for a full tree \text{treedepth\_\_}=\log_2(\text{n\_leapfrog\_\_}+1). The average treedepth__ is about 7, which is not high for hierarchical model posteriors, and the variation, measured by the standard deviation, is low, which indicates that the posterior curvature is not highly varying and thus there is no strong funnel shape. As the ESSs are low for the parameter a, we check the trace plot.
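Before looking at the trace plot, we can sanity-check this relation with a little arithmetic, using the averages printed in the diagnostics above (a hedged side computation, not part of the fitted model):

```r
# For a full binary tree of depth d, NUTS takes 2^d - 1 leapfrog steps,
# so treedepth__ is approximately log2(n_leapfrog__ + 1).
n_leapfrog <- 118     # mean n_leapfrog__ from the diagnostics above
log2(n_leapfrog + 1)  # about 6.9, consistent with the mean treedepth__ of 6.8
```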

draws_1 <- fit_1$draws(format = "df")
draws_1 |>
  mcmc_trace(pars = "a") +
  labs(x = "Iteration")
Figure 4

There is clearly high autocorrelation. As we have 51953 observations, we would expect the posterior for a to be narrow, but here the posterior standard deviation is 0.46.

Examining the model code, we see

  y ~ bernoulli_logit_glm(X, a + a_respondent[respondent] + a_item[item], b);

and remember the discussion in Section 12.3 about parameters with similar roles and identifiability. Here a, a_respondent, and a_item all influence the total intercept: if the value of a increases, the total intercept stays the same as long as the values of a_respondent and a_item decrease correspondingly. These parameters are not well identified individually.
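As a toy illustration of this additive non-identifiability (made-up numbers, not the fitted values): shifting a constant between a and a_item leaves the linear predictor, and hence the likelihood, unchanged.

```r
a <- -2.4                    # hypothetical intercept
a_item <- c(1.2, -0.5, 3.0)  # hypothetical item effects
c_shift <- 0.7
eta_1 <- a + a_item
eta_2 <- (a + c_shift) + (a_item - c_shift)
all.equal(eta_1, eta_2)      # TRUE: the data cannot tell these apart
```

Only the prior on a_respondent and a_item weakly identifies this direction, which is why the chains for a mix so slowly.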

Let’s check the diagnostics for a_respondent and a_item, too.

draws_1 |>
  subset_draws(variable = "a_respondent") |>
  summarize_draws()
# A tibble: 2,409 × 10
   variable          mean median    sd   mad    q5   q95  rhat ess_bulk ess_tail
   <chr>            <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
 1 a_respondent[1]  -0.29  -0.31  1.69  1.83 -3.09  2.37  1.01  2174.04   621.43
 2 a_respondent[2]  -0.40  -0.39  0.70  0.69 -1.67  0.68  1.02  1145.63   458.25
 3 a_respondent[3]   1.35   1.36  0.48  0.49  0.59  2.13  1.00  1436.17   522.30
 4 a_respondent[4]  -0.58  -0.51  1.18  1.17 -2.66  1.28  1.00  1641.24   561.18
 5 a_respondent[5]  -0.55  -0.47  1.12  1.17 -2.45  1.20  1.01  1586.11   710.98
 6 a_respondent[6]  -0.91  -0.86  0.76  0.79 -2.17  0.29  1.00  1504.70   515.67
 7 a_respondent[7]  -2.63  -2.56  1.09  1.16 -4.56 -0.99  1.01  1175.91   517.02
 8 a_respondent[8]   0.76   0.76  0.57  0.58 -0.17  1.67  1.00  1087.90   464.82
 9 a_respondent[9]  -2.57  -2.49  1.03  1.04 -4.42 -1.00  1.01   816.80   446.94
10 a_respondent[10]  1.99   1.99  0.66  0.62  0.83  3.05  1.01  1327.17   554.86
# ℹ 2,399 more rows
draws_1 |>
  subset_draws(variable = "a_item") |>
  summarize_draws()
# A tibble: 27 × 10
   variable    mean median    sd   mad    q5   q95  rhat ess_bulk ess_tail
   <chr>      <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
 1 a_item[1]   8.21   8.23  0.52  0.55  7.31  9.04  1.21    14.32    56.19
 2 a_item[2]   3.00   3.03  0.47  0.44  2.17  3.76  1.27    12.04    36.25
 3 a_item[3]  -1.04  -1.00  0.47  0.46 -1.88 -0.30  1.27    12.22    52.87
 4 a_item[4]  -1.34  -1.30  0.47  0.46 -2.16 -0.57  1.26    12.31    46.47
 5 a_item[5]   0.96   1.00  0.47  0.45  0.12  1.69  1.28    11.76    33.13
 6 a_item[6]  -1.29  -1.26  0.47  0.46 -2.12 -0.56  1.25    12.64    28.31
 7 a_item[7]  -0.80  -0.75  0.47  0.46 -1.62 -0.06  1.26    12.19    29.59
 8 a_item[8]   1.84   1.88  0.47  0.46  1.04  2.59  1.27    11.96    24.67
 9 a_item[9]   3.20   3.24  0.47  0.45  2.39  3.95  1.28    11.72    29.28
10 a_item[10]  1.15   1.19  0.47  0.46  0.32  1.88  1.27    11.96    43.40
# ℹ 17 more rows

The ESSs for a_respondent are high while the ESSs for a_item are low. As each a_respondent depends on a smaller number of observations, the correspondingly larger posterior uncertainty swamps the autocorrelation caused by the dependency.

We can examine the scatter plot of a and the sum of a_item to see the strong correlation.

draws_rvars(a = as_draws_rvars(draws_1)$a,
            sum_a_item = as_draws_rvars(draws_1)$a_item |> rvar_sum()) |>
  mcmc_scatter(alpha = 0.5) +
  labs(y = "sum(a_item)")
Figure 5

4.3 Hierarchical logistic regression with sum-to-zero parameterization

We can remove the identifiability problem by constraining a_respondent and a_item to sum to zero. This is easily achieved in Stan with the sum_to_zero_vector type. In addition to potentially improving sampling speed, the sum-to-zero constraint reduces posterior dependencies, which also improves the interpretability of the marginal posteriors of the parameters. As the sum_to_zero_vector type does not allow a multiplier, we need to change how we implement the non-centered parameterization.
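As a reminder of what the constraint does, here is a hedged sketch with made-up numbers (Stan's sum_to_zero_vector uses an isometric transform of K-1 free parameters rather than simple centering, but the constrained space is the same): removing the mean removes exactly the shift degree of freedom that a absorbs.

```r
a_item_raw <- c(1.2, -0.5, 3.0, -1.1)        # hypothetical unconstrained values
a_item_stz <- a_item_raw - mean(a_item_raw)  # project onto the sum-zero subspace
sum(a_item_stz)                              # 0 up to floating point error
```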

park_2 <- cmdstan_model(root("park_rule", "park_2.stan"))
print_stan_code(park_2$code())
data {
  int<lower=0> N, J, K, L;
  array[N] int<lower=0, upper=1> y;
  array[N] int<lower=1, upper=J> respondent;
  array[N] int<lower=1, upper=K> item;
  matrix[N, L] X;
}
parameters {
  real a;
  vector[L] b;
  real<lower=0> sigma_respondent, sigma_item;
  sum_to_zero_vector[J] z_respondent;
  sum_to_zero_vector[K] z_item;
}
transformed parameters {
  vector[J] a_respondent = z_respondent * sigma_respondent;
  vector[K] a_item = z_item * sigma_item;
}
model {
  z_respondent ~ std_normal();
  z_item ~ std_normal();
  b ~ normal(0, 1);
  {sigma_respondent, sigma_item} ~ normal(0, 3);
  y ~ bernoulli_logit_glm(X, a + a_respondent[respondent] + a_item[item], b);
}
tic("Stan sampling model 2")
fit_2 <- park_2$sample(data = stan_data, init = 0.1,
                       iter_warmup = 200, iter_sampling = 200)
mytoc()
Stan sampling model 2 took 280 sec
draws_2 <- fit_2$draws(format = "df")
print(fit_2)
         variable     mean   median    sd   mad       q5      q95 rhat ess_bulk ess_tail
 lp__             -1.5e+04 -1.5e+04 50.97 48.72 -1.5e+04 -1.5e+04 1.02      165      222
 a                -2.4e+00 -2.4e+00  0.04  0.04 -2.5e+00 -2.4e+00 1.00      584      747
 b[1]              7.0e-02  7.0e-02  0.03  0.03  2.0e-02  1.2e-01 1.01     1570      607
 b[2]              8.0e-02  8.0e-02  0.03  0.04  3.0e-02  1.3e-01 1.00     2120      674
 b[3]              3.0e-02  3.0e-02  0.01  0.01  3.0e-02  4.0e-02 1.01      337      489
 sigma_respondent  1.9e+00  1.9e+00  0.04  0.04  1.9e+00  2.0e+00 1.01      277      447
 sigma_item        2.5e+00  2.4e+00  0.33  0.31  2.0e+00  3.1e+00 1.03      117      221
 z_respondent[1]  -1.7e-01 -1.2e-01  0.88  0.85 -1.7e+00  1.2e+00 1.00      590      490
 z_respondent[2]  -2.2e-01 -2.0e-01  0.36  0.38 -8.6e-01  3.6e-01 1.01      613      594
 z_respondent[3]   7.0e-01  7.1e-01  0.26  0.24  2.8e-01  1.1e+00 1.01      937      580

 # showing 10 of 4879 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)

The sampling time is reduced by about 10%, and we get a big improvement in \widehat{R} and ESSs for a. The posterior standard deviation of a is 0.04, which is much more sensible considering the data size.

Now \widehat{R} and ESSs indicate problems with sigma_item. Examining the sampler diagnostics shows that the treedepth is lower, indicating weaker posterior correlations, and its standard deviation is still low, indicating that the curvature is not highly varying.

fit_2$sampler_diagnostics() |> as_draws_rvars()
# A draws_rvars: 200 iterations, 4 chains, and 6 variables
$treedepth__: rvar<200,4>[1] mean ± sd:
[1] 6.5 ± 0.5 

$divergent__: rvar<200,4>[1] mean ± sd:
[1] 0 ± 0 

$energy__: rvar<200,4>[1] mean ± sd:
[1] 16304 ± 63 

$accept_stat__: rvar<200,4>[1] mean ± sd:
[1] 0.93 ± 0.083 

$stepsize__: rvar<200,4>[1] mean ± sd:
[1] 0.06 ± 0.017 

$n_leapfrog__: rvar<200,4>[1] mean ± sd:
[1] 95 ± 32 

We examine the convergence diagnostics for a_item.

draws_2 |>
  subset_draws(variable = "a_item") |>
  summarize_draws()
# A tibble: 27 × 10
   variable    mean median    sd   mad    q5   q95  rhat ess_bulk ess_tail
   <chr>      <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
 1 a_item[1]   8.27   8.27  0.23  0.22  7.91  8.67  1.00   770.25   663.37
 2 a_item[2]   3.06   3.06  0.06  0.06  2.97  3.15  1.00   798.98   760.88
 3 a_item[3]  -0.98  -0.98  0.08  0.07 -1.11 -0.85  1.00   786.48   701.58
 4 a_item[4]  -1.27  -1.27  0.09  0.09 -1.41 -1.13  1.00   789.87   798.24
 5 a_item[5]   1.02   1.02  0.06  0.06  0.93  1.11  1.00   815.75   751.34
 6 a_item[6]  -1.22  -1.21  0.09  0.09 -1.37 -1.08  1.00   994.48   701.54
 7 a_item[7]  -0.74  -0.74  0.08  0.08 -0.88 -0.61  1.01   998.69   447.51
 8 a_item[8]   1.91   1.91  0.07  0.07  1.81  2.03  1.01  1029.50   627.94
 9 a_item[9]   3.27   3.27  0.07  0.07  3.16  3.38  1.00   791.59   762.84
10 a_item[10]  1.22   1.23  0.06  0.07  1.12  1.33  1.00  2089.46   637.61
# ℹ 17 more rows

The usual way to investigate funnels in hierarchical models is to look at the scatter plot of one of the variables and the corresponding prior scale. We examine a_item and sigma_item. Instead of showing plots for all elements of the vector a_item, we show here the scatter plot for a_item[1] and sigma_item.

draws_2 |>
  subset_draws(variable = c("a_item[1]","sigma_item")) |>
  mcmc_scatter(alpha = 0.5)
Figure 6

There is no indication of a funnel. But this plot hides the issue, as the sampling was actually done using the parameters z_respondent and z_item!

  sum_to_zero_vector[J] z_respondent;
  sum_to_zero_vector[K] z_item;

Thus, we should investigate the sampling performance for z_item:

draws_2 |>
  subset_draws(variable = "z_item") |>
  summarize_draws()
# A tibble: 27 × 10
   variable    mean median    sd   mad    q5   q95  rhat ess_bulk ess_tail
   <chr>      <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
 1 z_item[1]   3.43   3.43  0.44  0.44  2.71  4.15  1.03   128.65   218.10
 2 z_item[2]   1.27   1.27  0.17  0.16  0.98  1.54  1.03   122.21   287.14
 3 z_item[3]  -0.41  -0.40  0.06  0.06 -0.51 -0.31  1.02   162.14   355.05
 4 z_item[4]  -0.53  -0.52  0.08  0.07 -0.65 -0.41  1.02   150.34   395.81
 5 z_item[5]   0.42   0.42  0.06  0.06  0.33  0.53  1.03   145.52   229.88
 6 z_item[6]  -0.50  -0.50  0.07  0.08 -0.63 -0.39  1.02   146.13   369.87
 7 z_item[7]  -0.31  -0.30  0.05  0.05 -0.39 -0.23  1.01   206.06   392.34
 8 z_item[8]   0.79   0.79  0.11  0.10  0.62  0.96  1.03   141.23   306.76
 9 z_item[9]   1.35   1.35  0.17  0.17  1.06  1.64  1.02   118.12   268.83
10 z_item[10]  0.51   0.50  0.07  0.07  0.39  0.63  1.02   149.10   312.45
# ℹ 17 more rows

Now \widehat{R} and ESSs indicate problems with z_item. We examine the scatter plot for z_item[1] and sigma_item:

draws_2 |>
  subset_draws(variable = c("z_item[1]","sigma_item")) |>
  mcmc_scatter(alpha = 0.5)
Figure 7

There is a strong correlation. There is also a slight banana shape, but as sigma_item is constrained to be positive, the sampling is done on the log scale, and we should use that scale for the scatter plot as well.

draws_2 |>
  subset_draws(variable = c("z_item[1]","sigma_item")) |>
  mcmc_scatter(transformations = list(sigma_item=log), alpha = 0.5) +
  labs(y = "log(sigma_item)")
Figure 8

The banana shape is weaker, and we see a mostly linear dependency, which matches what we inferred from the diagnostic values.

When the non-centered parameterization is used but the likelihood contribution is strong, we get a strong dependency between the latent value z and sigma. In this case we have a large number of observations per item, so the centered parameterization is likely to be the better choice.
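To see where the dependency comes from, note that the non-centered form sets a_item = z_item * sigma_item. If the likelihood pins down a_item[1] near its posterior mean of roughly 8.3, then z_item[1] is forced to track 1/sigma_item across draws (illustrative numbers, not actual draws):

```r
a_item_1 <- 8.3                       # roughly the posterior mean seen above
sigma_item_draws <- c(2.0, 2.4, 3.0)  # hypothetical posterior draws of sigma_item
a_item_1 / sigma_item_draws           # implied z_item[1] moves inversely with sigma_item
```

This deterministic inverse relation is exactly the strong (log-linear) dependency visible in the scatter plots.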

In our first model, the non-centered parameterization was implemented using the vector type with <multiplier=...>, which hides the latent parameter and makes the problem easier to miss. With the explicit latent parameter z_item, the issue was easier to detect.

4.4 Hierarchical logistic regression with centered sum-to-zero parameterization

We switch to using the centered parameterization for both a_respondent and a_item. We could still use the non-centered parameterization for a_respondent, but further experiments (not shown here) indicate that there is not much difference between the parameterizations for a_respondent, and thus we use the simpler form.

park_3 <- cmdstan_model(root("park_rule", "park_3.stan"))
print_stan_code(park_3$code())
data {
  int<lower=0> N, J, K, L;
  array[N] int<lower=0, upper=1> y;
  array[N] int<lower=1, upper=J> respondent;
  array[N] int<lower=1, upper=K> item;
  matrix[N, L] X;
}
parameters {
  real a;
  vector[L] b;
  real<lower=0> sigma_respondent, sigma_item;
  sum_to_zero_vector[J] a_respondent;
  sum_to_zero_vector[K] a_item;
}
model {
  a_respondent ~ normal(0, sigma_respondent);
  a_item ~ normal(0, sigma_item);
  b ~ normal(0, 1);
  {sigma_respondent, sigma_item} ~ normal(0, 3);
  y ~ bernoulli_logit_glm(X, a + a_respondent[respondent] + a_item[item], b);
}
tic("Stan sampling model 3")
fit_3 <- park_3$sample(data = stan_data, init = 0.1,
                       iter_warmup = 200, iter_sampling = 200)
mytoc()
Stan sampling model 3 took 270 sec
draws_3 <- fit_3$draws(format = "df")
print(fit_3)
         variable     mean   median    sd   mad       q5      q95 rhat ess_bulk ess_tail
 lp__             -1.7e+04 -1.7e+04 40.57 39.74 -1.7e+04 -1.7e+04 1.01      292      423
 a                -2.4e+00 -2.4e+00  0.04  0.04 -2.5e+00 -2.4e+00 1.00      437      370
 b[1]              7.0e-02  7.0e-02  0.03  0.03  2.0e-02  1.2e-01 1.00     1371      654
 b[2]              8.0e-02  8.0e-02  0.03  0.03  3.0e-02  1.3e-01 1.01     1948      772
 b[3]              3.0e-02  3.0e-02  0.01  0.01  2.0e-02  4.0e-02 1.01      336      421
 sigma_respondent  1.9e+00  1.9e+00  0.04  0.04  1.9e+00  2.0e+00 1.00      448      691
 sigma_item        2.4e+00  2.4e+00  0.31  0.31  1.9e+00  3.0e+00 1.00     1458      593
 a_respondent[1]  -3.5e-01 -2.6e-01  1.77  1.69 -3.3e+00  2.5e+00 1.00      743      496
 a_respondent[2]  -4.0e-01 -3.5e-01  0.73  0.73 -1.6e+00  7.3e-01 1.00      911      455
 a_respondent[3]   1.4e+00  1.4e+00  0.51  0.50  5.7e-01  2.3e+00 1.01      855      643

 # showing 10 of 2443 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)

The sampling time is reduced by about 10%, and we get a big improvement in \widehat{R} and ESSs for sigma_item.

Examining the sampler diagnostics shows that the treedepth is further reduced, indicating that the posterior is easier to sample than with the first and second model parameterizations.

fit_3$sampler_diagnostics() |> as_draws_rvars()
# A draws_rvars: 200 iterations, 4 chains, and 6 variables
$treedepth__: rvar<200,4>[1] mean ± sd:
[1] 5 ± 0 

$divergent__: rvar<200,4>[1] mean ± sd:
[1] 0 ± 0 

$energy__: rvar<200,4>[1] mean ± sd:
[1] 17939 ± 52 

$accept_stat__: rvar<200,4>[1] mean ± sd:
[1] 0.87 ± 0.11 

$stepsize__: rvar<200,4>[1] mean ± sd:
[1] 0.17 ± 0.0018 

$n_leapfrog__: rvar<200,4>[1] mean ± sd:
[1] 31 ± 0 

4.5 Hierarchical logistic regression with sum-to-zero parameterization and centered predictors

We can further reduce posterior dependencies by centering the predictor values. We can do the centering in the Stan transformed data block.
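To see why centering helps, consider the cross-product matrix X'X, whose off-diagonal entries induce posterior dependence between the intercept a and the coefficients b. A hedged sketch with a simulated binary predictor (not the survey data):

```r
set.seed(1)
x <- rbinom(1000, 1, 0.7)   # hypothetical binary predictor
X_raw <- cbind(1, x)
X_c <- cbind(1, x - mean(x))
crossprod(X_raw)[1, 2]      # = sum(x): intercept and slope columns overlap
crossprod(X_c)[1, 2]        # ~ 0: columns orthogonal after centering
```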

park_4 <- cmdstan_model(root("park_rule", "park_4.stan"))
print_stan_code(park_4$code())
data {
  int<lower=0> N, J, K, L;
  array[N] int<lower=0, upper=1> y;
  array[N] int<lower=1, upper=J> respondent;
  array[N] int<lower=1, upper=K> item;
  matrix[N, L] X;
}
transformed data {
  matrix[N, L] X_c;
  for (l in 1:L) {
    X_c[, l] = X[, l] - mean(X[, l]);
  }
}
parameters {
  real a;
  vector[L] b;
  real<lower=0> sigma_respondent, sigma_item;
  sum_to_zero_vector[J] a_respondent;
  sum_to_zero_vector[K] a_item;
}
model {
  a_respondent ~ normal(0, sigma_respondent);
  a_item ~ normal(0, sigma_item);
  b ~ normal(0, 1);
  {sigma_respondent, sigma_item} ~ normal(0, 3);
  y ~ bernoulli_logit_glm(X_c, a + a_respondent[respondent] + a_item[item], b);
}
tic("Stan sampling model 4")
fit_4 <- park_4$sample(data = stan_data, init = 0.1,
                       iter_warmup = 200, iter_sampling = 200)
mytoc()
Stan sampling model 4 took 260 sec
draws_4 <- fit_4$draws(format = "df")
print(fit_4)
         variable     mean   median    sd   mad       q5      q95 rhat ess_bulk ess_tail
 lp__             -1.7e+04 -1.7e+04 39.32 38.47 -1.7e+04 -1.7e+04 1.02      213      355
 a                -2.3e+00 -2.3e+00  0.03  0.03 -2.3e+00 -2.2e+00 1.00      366      421
 b[1]              7.0e-02  7.0e-02  0.03  0.03  2.0e-02  1.2e-01 1.00     2156      565
 b[2]              8.0e-02  8.0e-02  0.03  0.03  3.0e-02  1.3e-01 1.01     1842      682
 b[3]              3.0e-02  3.0e-02  0.01  0.01  2.0e-02  4.0e-02 1.02      259      486
 sigma_respondent  1.9e+00  1.9e+00  0.04  0.04  1.9e+00  2.0e+00 1.00      584      733
 sigma_item        2.4e+00  2.4e+00  0.34  0.31  1.9e+00  3.0e+00 1.01     1647      634
 a_respondent[1]  -3.5e-01 -3.5e-01  1.60  1.68 -3.0e+00  2.1e+00 1.01      819      714
 a_respondent[2]  -3.7e-01 -3.7e-01  0.67  0.59 -1.5e+00  7.2e-01 1.01      798      528
 a_respondent[3]   1.3e+00  1.4e+00  0.53  0.55  4.0e-01  2.2e+00 1.01      818      531

 # showing 10 of 2443 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)

The sampling time is reduced by about 10%. Compared to the first model, the sampling time is reduced by about 30%; all convergence diagnostics look better, and the effective sample size per iteration is much higher. The posterior standard deviation of a is further reduced to 0.03, as the correlation between a and the predictor coefficients has been removed by centering the predictors.

Looking at the sampler diagnostics, the treedepth remains low, and the adapted step size is slightly larger than with the previous model.

fit_4$sampler_diagnostics() |> as_draws_rvars()
# A draws_rvars: 200 iterations, 4 chains, and 6 variables
$treedepth__: rvar<200,4>[1] mean ± sd:
[1] 5 ± 0 

$divergent__: rvar<200,4>[1] mean ± sd:
[1] 0 ± 0 

$energy__: rvar<200,4>[1] mean ± sd:
[1] 17943 ± 54 

$accept_stat__: rvar<200,4>[1] mean ± sd:
[1] 0.85 ± 0.12 

$stepsize__: rvar<200,4>[1] mean ± sd:
[1] 0.19 ± 0.015 

$n_leapfrog__: rvar<200,4>[1] mean ± sd:
[1] 31 ± 1.1 

We refit the final model using the default number of iterations.

tic("Stan sampling model 4")
fit_4 <- park_4$sample(data = stan_data, init = 0.1)
mytoc()
Stan sampling model 4 took 450 sec
print(fit_4)
         variable     mean   median    sd   mad       q5      q95 rhat ess_bulk ess_tail
 lp__             -1.7e+04 -1.7e+04 38.17 38.27 -1.7e+04 -1.7e+04 1.00     1292     2105
 a                -2.3e+00 -2.3e+00  0.03  0.03 -2.3e+00 -2.2e+00 1.00     2170     2178
 b[1]              7.0e-02  7.0e-02  0.03  0.03  2.0e-02  1.2e-01 1.00     9704     2566
 b[2]              8.0e-02  8.0e-02  0.03  0.03  3.0e-02  1.3e-01 1.00     6263     2691
 b[3]              3.0e-02  3.0e-02  0.01  0.01  2.0e-02  4.0e-02 1.00     1747     2335
 sigma_respondent  1.9e+00  1.9e+00  0.04  0.04  1.9e+00  2.0e+00 1.00     2578     3030
 sigma_item        2.4e+00  2.4e+00  0.34  0.31  1.9e+00  3.0e+00 1.00     6800     2697
 a_respondent[1]  -3.7e-01 -3.5e-01  1.76  1.78 -3.3e+00  2.5e+00 1.00     5105     3109
 a_respondent[2]  -3.8e-01 -3.3e-01  0.71  0.66 -1.6e+00  7.3e-01 1.00     3653     2385
 a_respondent[3]   1.3e+00  1.4e+00  0.53  0.53  4.5e-01  2.2e+00 1.00     3915     2712

 # showing 10 of 2443 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)

\widehat{R} and ESS diagnostics look good. Although we are running five times more iterations, the sampling time is only 50% longer, because 1) most of the time is spent in the warmup before the adaptation is good, and 2) with longer adaptation the actual sampling is faster. Let’s check the sampler diagnostics.

fit_4$sampler_diagnostics() |> as_draws_rvars()
# A draws_rvars: 1000 iterations, 4 chains, and 6 variables
$treedepth__: rvar<1000,4>[1] mean ± sd:
[1] 4.9 ± 0.29 

$divergent__: rvar<1000,4>[1] mean ± sd:
[1] 0 ± 0 

$energy__: rvar<1000,4>[1] mean ± sd:
[1] 17942 ± 51 

$accept_stat__: rvar<1000,4>[1] mean ± sd:
[1] 0.86 ± 0.12 

$stepsize__: rvar<1000,4>[1] mean ± sd:
[1] 0.19 ± 0.0085 

$n_leapfrog__: rvar<1000,4>[1] mean ± sd:
[1] 30 ± 3.9 

The average treedepth__ has further decreased, which means the number of log density and gradient evaluations per iteration is reduced by approximately 23% compared to running the algorithm with fewer iterations. This further reduction comes from better adaptation of the mass matrix and step size during the warmup. The total sampling time is about 6 minutes on my laptop.
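The warmup/sampling time split and the average number of leapfrog steps can be inspected directly from the fit object; a quick sketch, reusing the fitted `fit_4` object from above:

```r
# Per-chain breakdown of warmup vs sampling time; with short runs,
# most of the cost is in the warmup phase
fit_4$time()$chains

# Average number of leapfrog steps per iteration; each leapfrog step
# is one log density and gradient evaluation
fit_4$sampler_diagnostics() |>
  as_draws_df() |>
  with(mean(n_leapfrog__))
```

Multiplying the average leapfrog count by the number of post-warmup iterations gives an estimate of the total gradient evaluations spent in sampling.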

4.6 Comparison of lme4 and Bayesian posterior estimates

As there are quite many observations, and lme4 also marginalizes over the latent values when estimating the standard deviations of the varying intercepts, is there much difference in the results?

We compare the point estimates (conditional mode for lme4, posterior mean for Bayes) and 90% intervals (normal approximation for lme4, central posterior interval for Bayes) for a_item and a_respondent.

dr4 <- fit_4$draws() |> as_draws_rvars()
a_item_hat <- ranef(fit_lme4)$item
a_item_sd <- sqrt(as.numeric(attr(a_item_hat, "postVar")))
a_item_h <- as.numeric(a_item_hat$`(Intercept)`)
rng <- range(summarize_draws(
  dr4$a_item, ~quantile(.x, probs = c(0.05, 0.95)))[,c("5%","95%")])
ggplot(data=NULL) +
  coord_fixed(xlim = rng, ylim = rng) +
  geom_abline(color="gray") +
  stat_pointinterval(aes(x = a_item_h, ydist = dr4$a_item),
                     .width = c(0.90),
                     interval_size_range = c(0.4, 0.8),
                     alpha = 0.5) +
  geom_pointinterval(aes(y = mean(dr4$a_item), x = a_item_h,
                         xmin = a_item_h - 1.64*a_item_sd,
                         xmax = a_item_h + 1.64*a_item_sd),
                     interval_size_range = c(0.4, 0.8),
                     alpha = 0.5) +
  labs(x = "lme4", y = "Stan", title = "a_item")
Figure 9
a_respondent_hat <- ranef(fit_lme4)$respondent
a_respondent_sd <- sqrt(as.numeric(attr(a_respondent_hat, "postVar")))
a_respondent_h <- as.numeric(a_respondent_hat$`(Intercept)`)
rng <- range(summarize_draws(
  dr4$a_respondent, ~quantile(.x, probs = c(0.05, 0.95)))[,c("5%","95%")])
ggplot(data=NULL) +
  coord_fixed(xlim = rng, ylim = rng) +
  geom_abline(color="gray") + 
  stat_pointinterval(aes(x = a_respondent_h, ydist = dr4$a_respondent),
                     .width = c(0.90),
                     interval_size_range = c(0.4, 0.8),
                     alpha = 0.1) +
  geom_pointinterval(aes(y = mean(dr4$a_respondent), x = a_respondent_h,
                         xmin = a_respondent_h - 1.64*a_respondent_sd,
                         xmax = a_respondent_h + 1.64*a_respondent_sd),
                     interval_size_range = c(0.4, 0.8),
                     alpha = 0.1) +
  labs(x = "lme4", y = "Stan", title = "a_respondent")
Figure 10

There is not much difference, although the Stan estimates have a slightly wider range. For most purposes the lme4 results would be just fine, and lme4 is computationally faster than full Bayes for data of this size.

The wider range of the Stan estimates is likely due to integrating over the uncertainty in sigma_item and sigma_respondent:

ggplot(data=NULL) +
  stat_slab(aes(xdist = dr4$sigma_item), density = "unbounded", trim = TRUE, fill = NA, color = "black") +
  scale_y_continuous(breaks = NULL) +
  theme(axis.line.y = element_blank(), strip.text.y = element_blank()) +
  coord_cartesian(expand = FALSE) +
  labs(x = "sigma_item", y = "") +
  geom_vline(xintercept = as.data.frame(VarCorr(fit_lme4))$sdcor[2], color = "black", linetype="dashed")+
  annotate(geom = "text", x = as.data.frame(VarCorr(fit_lme4))$sdcor[2]*1.02, y = .97, hjust=0, label = "lme4 estimate") +
ggplot(data=NULL) +
  stat_slab(aes(xdist = dr4$sigma_respondent), density = "unbounded", trim = TRUE, fill = NA, color = "black") +
  scale_y_continuous(breaks = NULL) +
  theme(axis.line.y = element_blank(), strip.text.y = element_blank()) +
  coord_cartesian(expand = FALSE) +
  labs(x = "sigma_respondent", y = "") +
  geom_vline(xintercept = as.data.frame(VarCorr(fit_lme4))$sdcor[1], color = "black", linetype="dashed") +
  annotate(geom = "text", x = as.data.frame(VarCorr(fit_lme4))$sdcor[1]*1.002, y = .97, hjust=0, label = "lme4 estimate")
Figure 11
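One way to quantify the wider Stan intervals is to compare, for each respondent, the posterior standard deviation of the latent value to the lme4 conditional standard deviation; a rough check, reusing `dr4` and `a_respondent_sd` computed above:

```r
# Ratio of Stan posterior sd to lme4 conditional sd for each respondent;
# values above 1 indicate wider Bayesian uncertainty, consistent with
# integrating over the uncertainty in the group-level standard deviations
ratio <- sd(dr4$a_respondent) / a_respondent_sd
summary(ratio)
```

Here `sd()` is the posterior-package method for rvar objects, which returns the per-element posterior standard deviations.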


References

Hart, H. L. A. 1958. “Positivism and the Separation of Law and Morals.” Harvard Law Review 71: 593–607.
Luu, D. 2024. “Why It’s Impossible to Agree on What’s Allowed.”
Schlag, P. 1999. “No Vehicles in the Park.” Seattle University Law Review 23: 381–89.
Turner, D. 2024. “No Vehicles in the Park.”

Licenses

  • Code © 2025–2026, Andrew Gelman and Aki Vehtari, licensed under BSD-3.
  • Text © 2025–2026, Andrew Gelman and Aki Vehtari, licensed under CC-BY-NC 4.0.