Building up to a hierarchical model: Coronavirus testing

Author

Andrew Gelman and Aki Vehtari

Published

2022-08-22

Modified

2026-04-07

This notebook includes the code for the Bayesian Workflow book Chapter 19 Building up to a hierarchical model: Coronavirus testing.

1 Introduction

In early April, 2020, Bendavid et al. (2020a) recruited 3330 residents of Santa Clara County, California and tested them for SARS-CoV-2 antibodies. We re-analyze the data.

Load packages

library(rprojroot)
# Locate the project root via its marker file; root() then builds paths under it
root <- has_file(".Bayesian-Workflow-root")$make_fix_file()
library(cmdstanr)
# Fixed CmdStanR output directory so Quarto caching works across renders
dir.create(root("coronavirus", "stan_output"), showWarnings = FALSE)
options(cmdstanr_output_dir = root("coronavirus", "stan_output"))
options(mc.cores = 4)  # run the 4 MCMC chains in parallel
library(posterior)
library(priorsense)
library(ggplot2)
library(bayesplot)
theme_set(bayesplot::theme_default(base_family = "sans", base_size = 14))
library(dplyr)
library(ggh4x)

print_stan_file <- function(file) {
  # Print a Stan program: inside a knitr render of a `results: asis` chunk,
  # emit it as a fenced ```stan block so Pandoc/Quarto applies syntax
  # highlighting; otherwise write the lines to the console.
  #
  # Fix: the original used scalar `&`, which evaluates BOTH operands, so
  # knitr::opts_current$get() ran (and could error when knitr is unavailable)
  # even outside rendering. `&&` short-circuits when not in a knitr render.
  code <- readLines(file)
  if (isTRUE(getOption("knitr.in.progress")) &&
        identical(knitr::opts_current$get("results"), "asis")) {
    # In render: emit as-is so Pandoc/Quarto does syntax highlighting
    block <- paste0("```stan", "\n", paste(code, collapse = "\n"), "\n", "```")
    knitr::asis_output(block)
  } else {
    writeLines(code)
  }
}

Define function to compute shortest posterior interval (SPIN; Liu, Gelman, and Zheng (2015))

spin <- function(x, lower = NULL, upper = NULL, conf = 0.95) {
  # Shortest posterior interval (SPIN; Liu, Gelman, and Zheng 2015):
  # among all intervals containing a `conf` fraction of the draws, return
  # the narrowest one. Optional hard bounds are prepended/appended so the
  # interval may start or end exactly at a boundary (e.g. 0 for a prevalence).
  draws <- sort(as.vector(x))
  if (!is.null(lower)) {
    if (lower > min(draws)) stop("lower bound is not lower than all the data")
    draws <- c(lower, draws)
  }
  if (!is.null(upper)) {
    if (upper < max(draws)) stop("upper bound is not higher than all the data")
    draws <- c(draws, upper)
  }
  n_draws <- length(draws)
  span <- round(conf * n_draws)
  # Width of every candidate interval covering `span` consecutive order stats
  widths <- draws[(span + 1):n_draws] - draws[seq_len(n_draws - span)]
  start <- which.min(widths)  # first index attaining the minimum width
  draws[c(start, start + span)]
}

2 Simple model fit using pooled specificity and sensitivity

# Path to the Stan program for the simple pooled model; display its code
code <- root("coronavirus", "santa-clara.stan")
print_stan_file(code)
data {
  // Positives (y_*) and tests (n_*) for: the unknown sample, the pooled
  // specificity calibration (known negatives), and the pooled sensitivity
  // calibration (known positives).
  int<lower = 0> y_sample, n_sample, y_spec, n_spec, y_sens, n_sens;
}
parameters {
  real<lower=0, upper = 1> p, spec, sens;  // prevalence, specificity, sensitivity
}
transformed parameters {
  // Probability a sampled test reads positive: true positives + false positives
  real p_sample = p * sens + (1 - p) * (1 - spec);
}
model {
  y_sample ~ binomial(n_sample, p_sample);
  y_spec ~ binomial(n_spec, spec);
  y_sens ~ binomial(n_sens, sens);
}
generated quantities {
  // Terms for priorsense power-scaling: the sample data are the "likelihood",
  // the calibration data are power-scaled as the "prior" part
  real log_lik = binomial_lpmf(y_sample | n_sample, p_sample);
  real lprior = binomial_lpmf(y_spec | n_spec, spec) +
                binomial_lpmf(y_sens | n_sens, sens);
}
sc_model <- cmdstan_model(code)

Compute posterior with data from Bendavid et al. (2020a), 11 April 2020

fit_1 <- sc_model$sample(
  data = list(
    y_sample = 50,      # positive tests in the Santa Clara sample
    n_sample = 3330,    # sample size
    y_spec = 369 + 30,  # pooled negatives among known negatives
    n_spec = 371 + 30,  # pooled specificity calibration tests
    y_sens = 25 + 78,   # pooled positives among known positives
    n_sens = 37 + 85    # pooled sensitivity calibration tests
  ),
  refresh = 0,
  iter_warmup = 1e4,
  iter_sampling = 1e4
)
print(fit_1, digits = 3)

Check prior-likelihood sensitivity using powerscaling

powerscale_sensitivity(fit_1)
Sensitivity based on cjs_dist
Prior selection: all priors
Likelihood selection: all data

 variable prior likelihood                                diagnosis
        p 0.046      0.055                                        -
     spec 0.094      0.015 potential strong prior / weak likelihood
     sens 0.109      0.001 potential strong prior / weak likelihood
 p_sample 0.007      0.101                                        -
# Density plots under power-scaled priors/likelihood for the key parameters
powerscale_plot_dens(
  fit_1,
  variables = c("p", "spec", "sens"),
  lower_alpha = 0.5,
  help_text = FALSE
)
Figure 1

Inference for the population prevalence

draws_1 <- fit_1$draws()
# Thin to 1000 of the 1e4 post-warmup iterations (the same iteration subset
# is taken from every chain)
subset <- sample(1e4, 1e3)
x <- as.vector(draws_1[subset, , "spec"])
y <- as.vector(draws_1[subset, , "p"])
# Base-graphics scatterplot of specificity vs prevalence draws
par(mar = c(3, 3, 0, 1), mgp = c(2, .7, 0), tck = -.02)
plot(x, y, 
     xlim = c(min(x), 1), ylim = c(0, max(y)), 
     xaxs = "i", yaxs = "i", 
     xlab = expression(paste("Specificity, ", gamma)), 
     ylab = expression(paste("Prevalence, ", pi)), bty = "l", 
     pch=20, cex=.3)
Figure 2

ggplot version

# Same scatterplot with bayesplot/ggplot2; resample_draws thins to 1000 draws
fit_1$draws() |>
  resample_draws(ndraws = 1e3) |>
  mcmc_scatter(pars = c("spec", "p"), size = 1, alpha = 0.5) +
  lims(x = c(NA, 1), y = c(0, NA)) +
  # NOTE(review): named per-side `expand` presumably needs ggplot2 >= 3.5 — confirm
  coord_cartesian(expand = c(bottom = FALSE)) +
  labs(x = expression(paste("Specificity, ", gamma)), 
       y = expression(paste("Prevalence, ", pi)))
Figure 3
# Base-graphics histogram of the prevalence draws (y from the scatterplot above)
par(mar = c(3, 3, 0, 1), mgp = c(2, .7, 0), tck = -.02)
hist(y, 
     yaxt = "n", yaxs = "i", 
     xlab = expression(paste("Prevalence, ", pi)), 
     ylab = "", main = "")
Figure 4
# ggplot/bayesplot version of the prevalence histogram
fit_1$draws() |>
  mcmc_hist(pars = c("p"), breaks = seq(0, 0.03, length.out = 30)) +
  theme(axis.line.y = element_blank()) + 
  labs(x = expression(paste("Prevalence, ", pi)))
Figure 5

Use the shortest posterior interval, which makes more sense than a central interval because of the skewness of the posterior and the hard boundary at 0.

print(spin(draws_1[, , "p"], lower = 0, upper = 1, conf = 0.95), digits = 2)
[1] 0.0011 0.0186

Compute posterior with data from Bendavid et al. (2020b), 27 April 2020

# Version-2 data: much larger pooled specificity calibration (3308 of 3324)
fit_2 <- sc_model$sample(
  data = list(
    y_sample = 50,
    n_sample = 3330,
    y_spec = 3308,
    n_spec = 3324,
    y_sens = 130,
    n_sens = 157
  ),
  refresh = 0,
  iter_warmup = 1e4,
  iter_sampling = 1e4
)

print(fit_2, digits = 3)
draws_2 <- fit_2$draws()
# 95% shortest posterior interval for prevalence
print(spin(draws_2[, , "p"], lower = 0, upper = 1, conf = 0.95), digits = 2)

Check prior-likelihood sensitivity using powerscaling

powerscale_sensitivity(fit_2)
Sensitivity based on cjs_dist
Prior selection: all priors
Likelihood selection: all data

 variable prior likelihood                                diagnosis
        p 0.030      0.077                                        -
     spec 0.119      0.001 potential strong prior / weak likelihood
     sens 0.101      0.001 potential strong prior / weak likelihood
 p_sample 0.001      0.098                                        -
# Power-scaled density plots for the version-2 fit
powerscale_plot_dens(
  fit_2,
  variables = c("p", "spec", "sens"),
  lower_alpha = 0.5,
  help_text = FALSE
)
Figure 6

3 Hierarchical model for sensitivity and specificity

Hierarchical model that allows sensitivity and specificity to vary across studies.

# Path to the hierarchical Stan program; display its code
code_hierarchical <- root("coronavirus", "santa-clara-hierarchical.stan")
print_stan_file(code_hierarchical)
data {
  int y_sample;  // positives in the unknown sample
  int n_sample;  // tests in the unknown sample
  int J_spec;    // number of specificity calibration studies
  int J_sens;    // number of sensitivity calibration studies
  array[J_spec] int y_spec;  // negatives among known negatives, per study
  array[J_spec] int n_spec;  // known-negative tests, per study
  array[J_sens] int y_sens;  // positives among known positives, per study
  array[J_sens] int n_sens;  // known-positive tests, per study
  real logit_spec_prior_scale;  // hyperprior sd for sigma_logit_spec
  real logit_sens_prior_scale;  // hyperprior sd for sigma_logit_sens
}
parameters {
  real<lower=0, upper=1> p;  // prevalence in the unknown sample
  real mu_logit_spec;        // mean of study-level logit specificities
  real mu_logit_sens;        // mean of study-level logit sensitivities
  real<lower=0> sigma_logit_spec;  // between-study sd, logit specificity
  real<lower=0> sigma_logit_sens;  // between-study sd, logit sensitivity
  // offset/multiplier gives a non-centered parameterization of the
  // study-level effects, easing sampling when the sds are small
  vector<offset=mu_logit_spec, multiplier=sigma_logit_spec>[J_spec] logit_spec;
  vector<offset=mu_logit_sens, multiplier=sigma_logit_sens>[J_sens] logit_sens;
}
transformed parameters {
  vector[J_spec] spec = inv_logit(logit_spec);
  vector[J_sens] sens = inv_logit(logit_sens);
  // Study 1 is the assay applied to the unknown sample
  real p_sample = p * sens[1] + (1 - p) * (1 - spec[1]);
}
model {
  y_sample ~ binomial(n_sample, p_sample);
  y_spec ~ binomial(n_spec, spec);
  y_sens ~ binomial(n_sens, sens);
  logit_spec ~ normal(mu_logit_spec, sigma_logit_spec);
  logit_sens ~ normal(mu_logit_sens, sigma_logit_sens);
  sigma_logit_spec ~ normal(0, logit_spec_prior_scale);
  sigma_logit_sens ~ normal(0, logit_sens_prior_scale);
  mu_logit_spec ~ normal(4, 2); // weak prior on mean of dist of specificity
  mu_logit_sens ~ normal(4, 2); // weak prior on mean of dist of sensitivity
}
generated quantities {
  // Terms for priorsense power-scaling.
  // NOTE(review): lprior includes only the mu hyperpriors -- not the sigma
  // hyperpriors or the hierarchical normals. Confirm this selection of
  // power-scaled "priors" is intentional.
  real log_lik = binomial_lpmf(y_sample | n_sample, p_sample);
  real lprior = normal_lpdf(mu_logit_spec | 4, 2) +
                normal_lpdf(mu_logit_sens | 4, 2); 
}
sc_model_hierarchical <- cmdstan_model(code_hierarchical)

Compute posterior using data from Bendavid et al. (2020b) 27 April 2020

# Study 1 has no calibration observations (0 of 0), so its binomial term
# contributes nothing: spec[1]/sens[1] -- the assay used on the sample --
# are informed only through the hierarchical distribution
santaclara_data <- list(
  y_sample = 50,
  n_sample = 3330,
  J_spec = 14,
  y_spec = c(0, 368, 30, 70, 1102, 300, 311, 500, 198, 99, 29, 146, 105, 50),
  n_spec = c(0, 371, 30, 70, 1102, 300, 311, 500, 200, 99, 31, 150, 108, 52),
  J_sens = 4,
  y_sens = c(0, 78, 27, 25),
  n_sens = c(0, 85, 37, 35),
  logit_spec_prior_scale = 1,
  logit_sens_prior_scale = 1
)
fit_3a <- sc_model_hierarchical$sample(
  data = santaclara_data,
  refresh = 0,
  iter_warmup = 1e4,
  iter_sampling = 1e4, 
  adapt_delta = 0.99 # to reduce divergences 
)

# Focus the printed summary on the key structural parameters
print_variables <- c("p", "spec[1]", "sens[1]", 
                     "mu_logit_spec", "mu_logit_sens", 
                     "sigma_logit_spec", "sigma_logit_sens")
print(fit_3a, variables = print_variables, digits = 3)

draws_3a <- fit_3a$draws()
# 95% shortest posterior interval for prevalence
print(spin(draws_3a[, , "p"], lower = 0, upper = 1, conf = 0.95), digits = 2)

Check prior-likelihood sensitivity using powerscaling

# Power-scaling diagnostic for all study-level spec/sens parameters; n = Inf
# prints every row rather than truncating
powerscale_sensitivity(fit_3a, variable = c("p", "sens", "spec")) |>
  print(n = Inf)
Sensitivity based on cjs_dist
Prior selection: all priors
Likelihood selection: all data

 variable prior likelihood                                diagnosis
        p 0.102      0.010 potential strong prior / weak likelihood
  sens[1] 0.061      0.001 potential strong prior / weak likelihood
  sens[2] 0.013      0.001                                        -
  sens[3] 0.014      0.001                                        -
  sens[4] 0.013      0.001                                        -
  spec[1] 0.016      0.022                                        -
  spec[2] 0.005      0.003                                        -
  spec[3] 0.023      0.008                                        -
  spec[4] 0.019      0.003                                        -
  spec[5] 0.020      0.002                                        -
  spec[6] 0.020      0.005                                        -
  spec[7] 0.021      0.003                                        -
  spec[8] 0.020      0.002                                        -
  spec[9] 0.007      0.002                                        -
 spec[10] 0.023      0.003                                        -
 spec[11] 0.002      0.002                                        -
 spec[12] 0.002      0.002                                        -
 spec[13] 0.002      0.002                                        -
 spec[14] 0.003      0.002                                        -
# Power-scaled density plots; ggh4x sets per-facet x-axis limits
# (the same three limits are reused for the prior and likelihood facet rows)
powerscale_plot_dens(
  fit_3a,
  variables = c("p", "spec[1]", "sens[1]"),
  lower_alpha = 0.5,
  help_text = FALSE
) +
  ggh4x::facetted_pos_scales(x=rep(list(scale_x_continuous(limits=c(0, 0.35)),
                                        scale_x_continuous(limits=c(0, 1)),
                                        scale_x_continuous(limits=c(0.98, 1))), 2))
Figure 7

4 Hierarchical model with stronger priors

Compute posterior using data from Bendavid et al. (2020b) 27 April 2020 and stronger priors

# Tighten the hyperprior scales on the between-study sds of logit spec/sens
santaclara_data$logit_spec_prior_scale <- 0.3
santaclara_data$logit_sens_prior_scale <- 0.3

MCMC sometimes gets stuck in a minor mode, so we initialize with Pathfinder

# Run 10 Pathfinder paths to obtain initial values away from the minor mode
pth_3b <- sc_model_hierarchical$pathfinder(
  data = santaclara_data,
  refresh = 0,
  max_lbfgs_iters = 100,
  num_paths = 10
)
fit_3b <- sc_model_hierarchical$sample(
  data = santaclara_data,
  refresh = 0,
  iter_warmup = 1e4,
  iter_sampling = 1e4, 
  adapt_delta = 0.99, # not as necessary with the stronger priors, but include just in case
  init = pth_3b  # initialize MCMC from the Pathfinder draws
)

print(fit_3b, variables = print_variables, digits = 3)

draws_3b <- fit_3b$draws()
print(spin(draws_3b[, , "p"], lower = 0, upper = 1, conf = 0.95), digits = 2)

# 95% shortest posterior intervals under the weaker priors (fit_3a) ...
print(spin(draws_3a[, , "p"], lower = 0, upper = 1, conf = 0.95), digits = 2)
print(spin(draws_3a[, , "spec[1]"], lower = 0, upper = 1, conf = 0.95), digits = 2)
print(spin(draws_3a[, , "sens[1]"], lower = 0, upper = 1, conf = 0.95), digits = 2)
print(spin(draws_3a[, , "mu_logit_spec"], conf = 0.95), digits = 2)
print(spin(draws_3a[, , "mu_logit_sens"], conf = 0.95), digits = 2)
print(spin(draws_3a[, , "sigma_logit_spec"], conf = 0.95), digits = 2)
print(spin(draws_3a[, , "sigma_logit_sens"], conf = 0.95), digits = 2)

# ... and the same intervals under the stronger priors (fit_3b)
print(spin(draws_3b[, , "p"], lower = 0, upper = 1, conf = 0.95), digits = 2)
print(spin(draws_3b[, , "spec[1]"], lower = 0, upper = 1, conf = 0.95), digits = 2)
print(spin(draws_3b[, , "sens[1]"], lower = 0, upper = 1, conf = 0.95), digits = 2)
print(spin(draws_3b[, , "mu_logit_spec"], conf = 0.95), digits = 2)
print(spin(draws_3b[, , "mu_logit_sens"], conf = 0.95), digits = 2)
print(spin(draws_3b[, , "sigma_logit_spec"], conf = 0.95), digits = 2)
print(spin(draws_3b[, , "sigma_logit_sens"], conf = 0.95), digits = 2)

5 MRP model

MRP model allowing prevalence to vary by sex, ethnicity, age category, and zip code. Model is set up to use the ethnicity, age, and zip categories of Bendavid et al. (2020b)

# Path to the hierarchical MRP Stan program; display its code
code_model_hierarchical_mrp <- root("coronavirus", "santa-clara-hierarchical-mrp.stan")
print_stan_file(code_model_hierarchical_mrp)
data {
  int<lower = 0> N;  // number of tests in the sample (3330 for Santa Clara)
  array[N] int<lower = 0, upper = 1> y;  // 1 if positive, 0 if negative
  vector<lower = 0, upper = 1>[N] male;  // 0 if female, 1 if male
  array[N] int<lower = 1, upper = 4> eth;  // 1=white, 2=asian, 3=hispanic, 4=other
  array[N] int<lower = 1, upper = 4> age;  // 1=0-4, 2=5-18, 3=19-64, 4=65+
  int<lower = 0> N_zip;  // number of zip codes (58 in this case)
  array[N] int<lower = 1, upper = N_zip> zip;  // zip codes 1 through 58
  vector[N_zip] x_zip;  // predictors at the zip code level
  // Calibration studies, as in the hierarchical model
  int<lower = 0> J_spec;
  array[J_spec] int<lower = 0> y_spec;
  array[J_spec] int<lower = 0> n_spec;
  int<lower = 0> J_sens;
  array[J_sens] int<lower = 0> y_sens;
  array[J_sens] int<lower = 0> n_sens;
  int<lower = 0> J;  // number of population cells, J = 2*4*4*58
  vector<lower = 0>[J] N_pop;  // population sizes for poststratification
  real<lower = 0> coef_prior_scale;
  real<lower = 0> logit_spec_prior_scale;
  real<lower = 0> logit_sens_prior_scale;
}
parameters {
  real mu_logit_spec;
  real mu_logit_sens;
  real<lower = 0> sigma_logit_spec;
  real<lower = 0> sigma_logit_sens;
  // Non-centered parameterization of study-level spec/sens
  vector<offset = mu_logit_spec, multiplier = sigma_logit_spec>[J_spec] logit_spec;
  vector<offset = mu_logit_sens, multiplier = sigma_logit_sens>[J_sens] logit_sens;
  vector[3] b;  // intercept, coef for male, and coef for x_zip
  real<lower = 0> sigma_eth;
  real<lower = 0> sigma_age;
  real<lower = 0> sigma_zip;
  vector<multiplier = sigma_eth>[4] a_eth;  // varying intercepts for ethnicity
  vector<multiplier = sigma_age>[4] a_age;  // varying intercepts for age category
  vector<multiplier = sigma_zip>[N_zip] a_zip;  // varying intercepts for zip code
}
transformed parameters {
  vector[J_spec] spec = inv_logit(logit_spec);
  vector[J_sens] sens = inv_logit(logit_sens);
}
model {
  // Individual-level true prevalence on the logit scale
  vector[N] p = inv_logit(b[1]
                          + b[2] * male
                          + b[3] * x_zip[zip]
                          + a_eth[eth]
                          + a_age[age]
                          + a_zip[zip]);
  // Observed positive-test probability; study 1 is the assay used on the sample
  vector[N] p_sample = p * sens[1] + (1 - p) * (1 - spec[1]);
  y ~ bernoulli(p_sample);
  y_spec ~ binomial(n_spec, spec);
  y_sens ~ binomial(n_sens, sens);
  logit_spec ~ normal(mu_logit_spec, sigma_logit_spec);
  logit_sens ~ normal(mu_logit_sens, sigma_logit_sens);
  sigma_logit_spec ~ normal(0, logit_spec_prior_scale);
  sigma_logit_sens ~ normal(0, logit_sens_prior_scale);
  mu_logit_spec ~ normal(4, 2);  // weak prior on mean of distribution of spec
  mu_logit_sens ~ normal(4, 2);  // weak prior on mean of distribution of sens
  a_eth ~ normal(0, sigma_eth);
  a_age ~ normal(0, sigma_age);
  a_zip ~ normal(0, sigma_zip);
  // prior on centered intercept
  b[1] + b[2] * mean(male) + b[3] * mean(x_zip[zip]) ~ logistic(0, 1);
  b[2] ~ normal(0, coef_prior_scale);
  b[3] ~ normal(0, coef_prior_scale / sd(x_zip[zip]));  // prior on scaled coef
  sigma_eth ~ normal(0, coef_prior_scale);
  sigma_age ~ normal(0, coef_prior_scale);
  sigma_zip ~ normal(0, coef_prior_scale);
}
generated quantities {
  real p_avg;
  vector[J] p_pop;  // population prevalence in the J poststratification cells
  int count;
  count = 1;
  // Cell ordering (male fastest, then eth, age, zip) must match the loop
  // that builds N_pop on the R side
  for (i_zip in 1:N_zip) {
    for (i_age in 1:4) {
      for (i_eth in 1:4) {
        for (i_male in 0:1) {
          p_pop[count] = inv_logit(b[1]
                                   + b[2] * i_male
                                   + b[3] * x_zip[i_zip]
                                   + a_eth[i_eth]
                                   + a_age[i_age]
                                   + a_zip[i_zip]);
          count += 1;
        }
      }
    }
  }
  // Population-weighted average prevalence
  p_avg = sum(N_pop .* p_pop) / sum(N_pop);
}
sc_model_hierarchical_mrp <- cmdstan_model(code_model_hierarchical_mrp)

To fit the model, we need individual-level data.
These data are not publicly available, so just to get the program running, we take the existing 50 positive tests and assign them at random to the 3330 people.

# Fake outcome data: randomly scatter the 50 observed positives among the
# N tested individuals (the individual-level data are not public).
# Fix: reuse N instead of repeating the magic number 3330.
N <- 3330
y <- sample(rep(c(0, 1), c(N - 50, 50)))
n <- rep(1, N)

Here are the counts of each sex, ethnicity, and age from Bendavid et al. (2020b).
We don’t have zip code distribution but we looked it up and there are 58 zip codes in Santa Clara County; for simplicity we assume all zip codes are equally likely.
We then assign these traits to people at random.
This is wrong — in reality these variables are correlated in various ways — but, again, now we have fake data we can use to fit the model.

# Randomly assign sex, ethnicity, and age using the marginal counts from the
# paper (each vector of counts sums to 3330 = N, defined above), and assign
# zip codes uniformly at random over the 58 zips
male <- sample(rep(c(0, 1), c(2101, 1229)))
eth <- sample(rep(1:4, c(2118, 623, 266, 306 + 17)))
age <- sample(rep(1:4, c(71, 550, 2542, 167)))
N_zip <- 58
zip <- sample(1:N_zip, 3330, replace = TRUE)

Setting up the zip code level predictor.
In this case we will use random numbers with mean 50 and standard deviation 20.
These are arbitrary numbers that we chose just to be able to test the centering and scaling in the model.
In real life we might use %Latino or average income in the zip code

x_zip <- rnorm(N_zip, 50, 20)

Setting up the poststratification table.
For simplicity we assume there are 1000 people in each cell in the county.
Actually we’d want data from the Census.

# Poststratification cell sizes: J = sex (2) x eth (4) x age (4) x zip (N_zip)
# cells, each assumed to hold 1000 people (placeholder for real Census data).
# Fix: the original filled the constant with a 4-level nested loop; a single
# vectorized rep() produces the identical vector (all cells equal, so the
# cell ordering is unaffected).
J <- 2 * 4 * 4 * N_zip
N_pop <- rep(1000, J)

Put together the data and fit the model

# Combine the fake individual-level data with the calibration studies
# (same 14 spec / 4 sens studies and 0.3 hyperprior scales as fit_3b)
santaclara_mrp_data <- list(
  N = N,
  y = y,
  male = male,
  eth = eth,
  age = age,
  zip = zip,
  N_zip = N_zip,
  x_zip = x_zip,
  J_spec = 14,
  y_spec = c(0, 368, 30, 70, 1102, 300, 311, 500, 198, 99, 29, 146, 105, 50),
  n_spec = c(0, 371, 30, 70, 1102, 300, 311, 500, 200, 99, 31, 150, 108, 52),
  J_sens = 4,
  y_sens = c(0, 78, 27, 25),
  n_sens = c(0, 85, 37, 35),
  logit_spec_prior_scale = 0.3,
  logit_sens_prior_scale = 0.3,
  coef_prior_scale = 0.5,
  J = J,
  N_pop = N_pop
)
fit_4 <- sc_model_hierarchical_mrp$sample(data = santaclara_mrp_data)

Show inferences for some model parameters. In addition to p_avg, the population prevalence, we also look at the inferences for the first three poststratification cells just to check that everything makes sense

# Summaries for the main parameters plus the first three cells as a sanity check
print_variables <- c("p_avg", "b", "a_age", "a_eth", 
                     "sigma_eth", "sigma_age", "sigma_zip", 
                     "mu_logit_spec", "sigma_logit_spec",  
                     "mu_logit_sens", "sigma_logit_sens", 
                     "p_pop[1]", "p_pop[2]", "p_pop[3]")
print(fit_4, variables = print_variables, max_rows = 100)
         variable  mean median   sd  mad    q5   q95 rhat ess_bulk ess_tail
 p_avg             0.01   0.01 0.01 0.01  0.00  0.02 1.00     1662     1355
 b[1]             -3.97  -3.75 1.24 0.90 -6.42 -2.43 1.00     1360      775
 b[2]             -0.25  -0.24 0.38 0.37 -0.88  0.38 1.00     4879     2704
 b[3]             -0.02  -0.02 0.02 0.02 -0.05  0.01 1.00     1969     1323
 a_age[1]         -0.02   0.00 0.39 0.20 -0.67  0.53 1.00     4962     3082
 a_age[2]         -0.01   0.00 0.36 0.20 -0.59  0.54 1.00     4940     3325
 a_age[3]         -0.11  -0.05 0.35 0.19 -0.75  0.34 1.00     4728     3368
 a_age[4]         -0.01   0.00 0.38 0.21 -0.62  0.56 1.00     4961     3140
 a_eth[1]         -0.05  -0.01 0.35 0.20 -0.67  0.45 1.00     4570     3391
 a_eth[2]         -0.18  -0.07 0.40 0.23 -0.94  0.31 1.00     3177     3517
 a_eth[3]          0.15   0.06 0.40 0.23 -0.37  0.92 1.00     4033     3649
 a_eth[4]         -0.10  -0.03 0.39 0.21 -0.79  0.42 1.00     4836     3300
 sigma_eth         0.35   0.29 0.27 0.27  0.02  0.85 1.00     2633     1722
 sigma_age         0.32   0.27 0.26 0.23  0.03  0.86 1.00     3230     2308
 sigma_zip         0.39   0.35 0.27 0.28  0.03  0.89 1.00     1947     1937
 mu_logit_spec     5.19   5.19 0.33 0.33  4.65  5.75 1.00     2828     2891
 sigma_logit_spec  0.72   0.72 0.21 0.20  0.37  1.07 1.00     2219     1724
 mu_logit_sens     1.55   1.55 0.33 0.31  1.01  2.09 1.00     2877     2795
 sigma_logit_sens  0.40   0.39 0.20 0.19  0.08  0.73 1.00     1741     1036
 p_pop[1]          0.01   0.01 0.01 0.01  0.00  0.03 1.00     2001     1700
 p_pop[2]          0.01   0.01 0.01 0.01  0.00  0.02 1.00     2198     1923
 p_pop[3]          0.01   0.01 0.01 0.01  0.00  0.03 1.00     2013     1842
draws_4 <- fit_4$draws()
# 95% shortest posterior interval for the poststratified prevalence
print(spin(draws_4[, , "p_avg"], lower = 0, upper = 1, conf = 0.95), digits = 2)
[1] 0.00058 0.02245

6 Additional prior sensitivity analysis

# Calibration and sample data, one data frame per kind of study.
# Sensitivity studies: positives among known-positive samples.
pos_tests <- c(78, 27, 25)
tests <- c(85, 37, 35)
sens_df <- data.frame(
  pos_tests = pos_tests,
  tests = tests,
  sample_sens = pos_tests / tests
)

# Specificity studies: negatives among known-negative samples.
neg_tests <- c(368, 30, 70, 1102, 300, 311, 500, 198, 99, 29, 146, 105, 50)
tests <- c(371, 30, 70, 1102, 300, 311, 500, 200, 99, 31, 150, 108, 52)
spec_df <- data.frame(
  neg_tests = neg_tests,
  tests = tests,
  sample_spec = neg_tests / tests
)

# The Santa Clara sample itself, with unknown status.
pos_tests <- 50
tests <- 3330
unk_df <- data.frame(
  pos_tests = pos_tests,
  tests = tests,
  sample_prev = pos_tests / tests
)

Stan model for prior sensitivity analysis

# Path to the prior-sensitivity Stan program; display its code
code_sens <- root("coronavirus", "prior-sensitivity.stan")
print_stan_file(code_sens)
data {
  // Sensitivity studies: n_pos positives out of N_pos known positives
  int<lower = 0> K_pos;
  array[K_pos] int<lower = 0> N_pos;
  array[K_pos] int<lower = 0> n_pos;

  // Specificity studies: n_neg negatives out of N_neg known negatives
  int<lower = 0> K_neg;
  array[K_neg] int<lower = 0> N_neg;
  array[K_neg] int<lower = 0> n_neg;

  // Samples with unknown status (one here, but arrays allow several)
  int<lower = 0> K_unk;
  array[K_unk] int<lower = 0> N_unk;
  array[K_unk] int<lower = 0> n_unk;

  // Hyperprior scales varied in the sensitivity analysis
  real<lower = 0> sigma_sigma_logit_sens;
  real<lower = 0> sigma_sigma_logit_spec;
}
parameters {
  real mu_logit_sens;
  real mu_logit_spec;
  real<lower = 0> sigma_logit_sens;
  real<lower = 0> sigma_logit_spec;

  // Non-centered study-level effects; the *_unk vectors are the spec/sens
  // of the assays used on the unknown samples, drawn from the same
  // hierarchical distributions
  vector<offset = mu_logit_sens,
         multiplier = sigma_logit_sens>[K_pos] logit_sens;
  vector<offset = mu_logit_spec,
         multiplier = sigma_logit_spec>[K_neg] logit_spec;
  vector<offset = mu_logit_sens,
         multiplier = sigma_logit_sens>[K_unk] logit_sens_unk;
  vector<offset = mu_logit_spec,
         multiplier = sigma_logit_spec>[K_unk] logit_spec_unk;

  real<lower = 0, upper = 1> pi;  // prevalence
}
transformed parameters {
  vector[K_pos] sens = inv_logit(logit_sens);
  vector[K_neg] spec = inv_logit(logit_spec);
  vector[K_unk] sens_unk = inv_logit(logit_sens_unk);
  vector[K_unk] spec_unk = inv_logit(logit_spec_unk);
}
model {
  mu_logit_sens ~ normal(4, 2);
  sigma_logit_sens ~ normal(0, sigma_sigma_logit_sens);
  logit_sens ~ normal(mu_logit_sens, sigma_logit_sens);
  // binomial_logit works directly on the logit-scale parameters
  n_pos ~ binomial_logit(N_pos, logit_sens);

  mu_logit_spec ~ normal(4, 2);
  sigma_logit_spec ~ normal(0, sigma_sigma_logit_spec);
  logit_spec ~ normal(mu_logit_spec, sigma_logit_spec);
  n_neg ~ binomial_logit(N_neg, logit_spec);

  // Redundant given the declared bounds on pi (adds a constant), but explicit
  pi ~ uniform(0, 1);
  logit_sens_unk ~ normal(mu_logit_sens, sigma_logit_sens);
  logit_spec_unk ~ normal(mu_logit_spec, sigma_logit_spec);
  // Mixture of true and false positives for the unknown samples
  n_unk ~ binomial(N_unk, pi * sens_unk + (1 - pi) * (1 - spec_unk));
}
model_sens <- cmdstan_model(code_sens)

Data

# Data list for the prior-sensitivity model. array() wraps each vector so
# that length-1 inputs keep a dim attribute -- presumably so cmdstanr passes
# them to Stan as arrays rather than scalars (TODO confirm)
data <- list(
  K_pos = nrow(sens_df),
  N_pos = array(sens_df$tests),
  n_pos = array(sens_df$pos_tests),
  K_neg = nrow(spec_df),
  N_neg = array(spec_df$tests),
  n_neg = array(spec_df$neg_tests),
  K_unk = nrow(unk_df),
  N_unk = array(unk_df$tests),
  n_unk = array(unk_df$pos_tests)
)

Empty data frame to store posterior intervals with different priors

# Results accumulator for the prior-sensitivity grid; rows are appended after
# each fit below. Fix: data.frame(col = c()) silently drops the NULL columns,
# so the original produced a 0-column data frame -- the "named columns" never
# existed. An explicit empty data.frame is equivalent and honest.
ribbon_df <- data.frame()

Loop over different prior parameter values

# Fit the model over a grid of hyperprior scales and record 5%/50%/95%
# posterior quantiles of the prevalence pi for each setting.
sigma_senss <- c(0.01, 0.25, 0.5, 0.75, 1)
sigma_specss <- c(0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
# Fix: collect one row per fit in a preallocated list and bind once at the
# end, instead of growing ribbon_df with rbind() inside the loop (which
# copies the whole frame on every iteration).
rows <- vector("list", length(sigma_senss) * length(sigma_specss))
row_i <- 0
for (sigma_sens in sigma_senss) {
  for (sigma_spec in sigma_specss) {
    data2 <- append(data, list(sigma_sigma_logit_sens = sigma_sens,
                               sigma_sigma_logit_spec = sigma_spec))
    fit <- model_sens$sample(
      data = data2,
      iter_warmup = 5e4,
      iter_sampling = 5e4,
      seed = 1234,
      refresh = 0,
      adapt_delta = 0.95,
      show_messages = FALSE,
      show_exceptions = FALSE
    )
    pis <- fit$draws()[, , "pi"]
    row_i <- row_i + 1
    rows[[row_i]] <- data.frame(
      sigma_sens = paste("sensitivity hyperprior", "=", sigma_sens),
      sigma_spec = sigma_spec,
      prev05 = quantile(pis, 0.05),
      prev50 = quantile(pis, 0.5),
      prev95 = quantile(pis, 0.95)
    )
  }
}
ribbon_df <- rbind(ribbon_df, do.call(rbind, rows))
# Ribbon plot of prevalence quantiles against the specificity hyperprior
# scale, one panel per sensitivity hyperprior scale, log-scale y-axis
ggplot(ribbon_df, aes(x = sigma_spec)) +
  facet_wrap(~ sigma_sens, nrow = 1) +
  geom_ribbon(aes(ymin = prev05, ymax = prev95), fill = "gray95") +
  geom_line(aes(y = prev50), linewidth = 0.5) +
  geom_line(aes(y = prev05), color = "darkgray", linewidth = 0.25) +
  geom_line(aes(y = prev95), color = "darkgray", linewidth = 0.25) +
  scale_y_log10(limits = c(0.0009, 1.1), breaks = c(0.001, 0.01, 0.1, 1)) +
  scale_x_continuous(expand = c(0, 0), lim = c(0, 1),
                     breaks = c(0.1, 0.3, 0.5, 0.7, 0.9)) +
  ylab("prevalence") +
  xlab("specificity hyperprior") +
  theme_bw() +
  theme(panel.spacing = unit(0.25, "lines"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.background = element_blank())
Figure 8

References

Bendavid, Eran et al. 2020a. “COVID-19 Antibody Seroprevalence in Santa Clara County, California.” medRxiv Preprint medRxiv:10.1101/2020.04.14.20062463v1/.
——— et al. 2020b. “COVID-19 Antibody Seroprevalence in Santa Clara County, California, Version 2.” medRxiv Preprint medRxiv:10.1101/2020.04.14.20062463v2/.
Liu, Y., A. Gelman, and T. Zheng. 2015. “Simulation-Efficient Shortest Probability Intervals.” Statistics and Computing 25: 809–19.

Licenses

  • Code © 2022–2025, Andrew Gelman and Aki Vehtari, licensed under BSD-3.
  • Text © 2022–2025, Andrew Gelman and Aki Vehtari, licensed under CC-BY-NC 4.0.