library(rprojroot)
root <- has_file(".Bayesian-Workflow-root")$make_fix_file()
library(cmdstanr)
library(brms)
options(brms.backend = "cmdstanr", mc.cores = 4)
library(posterior)
library(ggplot2)
library(ggdist)
library(dplyr)
library(tidyr)
library(bayesplot)
theme_set(bayesplot::theme_default(base_family = "sans", base_size = 16))
library(marginaleffects)
print_stan_file <- function(file) {
code <- readLines(file)
if (isTRUE(getOption("knitr.in.progress")) &
identical(knitr::opts_current$get("results"), "asis")) {
# In render: emit as-is so Pandoc/Quarto does syntax highlighting
block <- paste0("```stan", "\n", paste(code, collapse = "\n"), "\n", "```")
knitr::asis_output(block)
} else {
writeLines(code)
}
}Bioassay case study
This notebook includes the code for Bayesian Workflow book Section 3.5 A simple example of probabilistic programming.
1 Introduction
We demonstrate the role of statistical programming environments and probabilistic programming by going through the steps of data analysis and computation in the context of a simple example.
We work through an example by Racine-Poon et al. (1986), also included in Section 2.8 of *Bayesian Data Analysis (Gelman et al. 2013), of a logistic regression fit to a bioassay experiment.
Load packages
2 Bioassay data
Read the data from csv
# df_bioassay <- read.csv("bioassay/data/bioassay.csv")Data as data frame
df_bioassay <- data.frame(
dose = c(-0.86, -0.30, -0.05, 0.73),
batch_size = c(5, 5, 5, 5),
deaths = c(0, 1, 3, 5)
)Data plotted with base R
with(df_bioassay,
plot(dose, deaths, xlab = "Dose log(g/ml)", ylab = "# of deaths",
pch = 19, cex = 1.5, bty = "l"))Data plotted with ggplot
df_bioassay |>
ggplot(aes(x = dose, y = deaths)) +
geom_point(size = 3) +
labs(x = "Dose log(g/ml)", y = "# of deaths")Data as list for CmdStanR
bioassay_data <- with(df_bioassay,
list(J = nrow(df_bioassay),
x = dose,
N = batch_size,
y = deaths))3 Stan models and inference
Stan model 0 (without priors)
bioassay_stan_file <- root("bioassay","bioassay0.stan")print_stan_file(bioassay_stan_file)data {
int<lower=0> J;
vector[J] x;
array[J] int<lower=0> N;
array[J] int<lower=0, upper=N> y;
}
parameters {
real a;
real b;
}
model {
y ~ binomial_logit(N, a + b * x);
}Compile the Stan model code using pedantic mode
mod0 <- cmdstan_model(bioassay_stan_file, pedantic = TRUE)Stan model 1 (with priors)
bioassay_stan_file <- root("bioassay","bioassay1.stan")print_stan_file(bioassay_stan_file)data {
int<lower=0> J;
vector[J] x;
array[J] int<lower=0> N;
array[J] int<lower=0, upper=N> y;
}
parameters {
real a;
real<lower=0> b;
}
model {
{a, b} ~ normal(0, 5);
y ~ binomial_logit(N, a + b * x);
}Compile the updated Stan model code using pedantic mode
mod1 <- cmdstan_model(bioassay_stan_file, pedantic = TRUE)Sample with default settings (expect no progress output)
fit1 <- mod1$sample(data = bioassay_data, refresh = 0)4 Posterior summary
Posterior summary and MCMC diagnostics
fit1 variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
lp__ -5.89 -5.59 1.00 0.71 -7.85 -4.96 1.00 1898 2299
a 0.57 0.53 0.77 0.75 -0.64 1.92 1.00 1769 1700
b 6.31 6.00 2.50 2.42 2.82 10.97 1.00 1785 2107
Posterior draws as data frame
draws1 <- fit1$draws(format="df")Plot with base plot: data, 20 logistic curves given 20 posterior draws, and one logistic given the posterior mean
par(mar=c(3,3,1,1), mgp=c(1.5,.5,0), tck=-.01)
with(df_bioassay,
plot(dose, deaths/batch_size, xlab = "Dose log(g/ml)", ylab = "Pr (death)",
pch=19, cex=1.5, bty="l"))
invlogit <- plogis
for (s in sample(nrow(draws1), 20)) {
curve(invlogit(draws1$a[s] + draws1$b[s] * x),
col = "red", lwd = 0.5, add = TRUE)
}
curve(invlogit(mean(draws1$a) + mean(draws1$b) * x),
col = "blue", lwd = 2, add = TRUE)Plot with ggplot: data, 20 logistic curves given 20 posterior draws, and one logistic given the posterior mean
draws1 |>
resample_draws(ndraws = 20) |>
expand_grid(x=seq(-1, 1, length = 100)) |>
mutate(y = plogis(a + b * x)) |>
ggplot() +
geom_point(data = df_bioassay, aes(x = dose, y = deaths / batch_size), size = 3) +
geom_line(aes(x = x, y = y, group = .draw), alpha = .5, color = "red") +
geom_function(fun = \(x) plogis(mean(draws1$a) + mean(draws1$b)*x),
color = "blue",
linewidth = 1) +
labs(x = "Dose log(g/ml)", y = "Pr(death)")5 Derived quantities
Compute posterior draws for LD50 in log(g/ml) and in mg/ml
draws1 <- draws1 |>
mutate_variables(LD50_log_g_ml = -a / b,
LD50_mg_ml = 1000 * exp(LD50_log_g_ml))Alternatively using base R (as posterior package draws object is more than just plain data frame, it is better to use mutate_variables(), which will also work for all other posterior draws objects (draws_list, draws_array, draws_rvars))
draws1$LD50_log_g_ml <- -draws1$a / draws1$b
draws1$LD50_mg_ml <- 1000 * exp(draws1$LD50_log_g_ml)Summarise LD50 posterior
draws1 |>
subset_draws(variable = "LD50_log_g_ml") |>
summarize_draws()# A tibble: 1 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 LD50_log_g_ml -0.0801 -0.0907 0.133 0.112 -0.277 0.158 1.00 2515. 2514.
Common part for the next two LD50 plots (without data)
ggplot_LD50_mg_ml <-
ggplot(mapping = aes(x = LD50_mg_ml)) +
scale_x_log10(limits = c(375,2700)) +
labs(x = "LD50 mg/ml") +
geom_vline(xintercept = c(500, 2000), alpha=0.1) +
annotate(geom = "text", x = 500*0.96, y = .97, hjust = 1, label = "Category 3") +
annotate(geom = "text", x = 1000, y = .97, label = "Category 4") +
annotate(geom = "text", x = 2000*1.04, y = .97, hjust = 0, label = "Category 5") +
theme_sub_axis_y(text = element_blank(),
line = element_blank(),
ticks = element_blank(),
title = element_blank())Quantile dot plot of the LD50 (mg/ml) posterior
ggplot_LD50_mg_ml +
stat_dots(data = draws1, quantiles = 100) +
coord_cartesian(expand = c(bottom = FALSE))Kernel density plot of the LD50 (mg/ml) posterior
ggplot_LD50_mg_ml +
stat_slab(data = draws1, color = "gray", fill = NA) +
coord_cartesian(expand = c(bottom = FALSE))6 brms model and inference
The same model with brms
bfit1 <- brm(deaths | trials(batch_size) ~ dose,
family = binomial(),
prior = c(prior(normal(0, 5), class = Intercept),
prior(normal(0, 5), lb = 0, class = b)),
data = df_bioassay,
refresh = 0)Summary of the inference
bfit1 Family: binomial
Links: mu = logit
Formula: deaths | trials(batch_size) ~ dose
Data: df_bioassay (Number of observations: 4)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept 0.60 0.79 -0.87 2.21 1.00 2348 2369
dose 6.42 2.48 2.53 12.11 1.00 2867 1914
Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Posterior draws as data frame
bdraws1 <- as_draws_df(bfit1)Plot data, 20 logistic curves given 20 posterior draws, and one logistic given the posterior mean
bdraws1 |>
resample_draws(ndraws = 20) |>
expand_grid(x=seq(-1, 1, length = 100)) |>
mutate(y = plogis(b_Intercept + b_dose * x)) |>
ggplot() +
geom_line(aes(x = x, y = y, group = .draw), alpha = .5, color = "red") +
geom_point(data = df_bioassay, aes(x = dose, y = deaths / batch_size), size = 3) +
geom_function(fun = \(x) plogis(mean(bdraws1$b_Intercept) + mean(bdraws1$b_dose)*x),
color = "blue",
linewidth = 1) +
labs(x = "Dose log(g/ml)", y = "Pr(death)")brms provides also shortcut for plotting the posterior mean and uncertainty. We use plot=FALSE and [[1]] to return a ggplot object, so that we can modify the axes labels.
p1 <- plot(conditional_effects(bfit1), plot=FALSE)[[1]] +
labs(x = "Dose log(g/ml)", y = "Pr(death)")
p1We can add data points with ggplot::geom_point().
p1 +
geom_point(data = df_bioassay,
inherit.aes = FALSE,
aes(x = dose, y = deaths / batch_size),
size = 3)We can draw also individual posterior curves with spaghetti=TRUE, ndraws=20, but then the posterior mean would be computed only from 20 curves. Increasing ndraws would make the plotted posterior mean more accurate, but would make the spaghetti plot more messy.
p1 <- plot(conditional_effects(bfit1, spaghetti=TRUE, ndraws=20),
plot=FALSE)[[1]] +
labs(x = "Dose log(g/ml)", y = "Pr(death)")
p1marginaleffects package also provides function for plotting the posterior prediction and uncertainty. By default it predicts on the scale of actual outcome, but as all batch sizes are equal we can transform to the interval from 0 to 1. We need to add the observed proportions of deaths with geom_point().
p2 <- marginaleffects::plot_predictions(bfit1,
condition = "dose",
transform = \(x) x/5) +
labs(x = "Dose log(g/ml)", y = "Pr(death)") +
geom_point(data = df_bioassay,
inherit.aes = FALSE,
aes(x = dose, y = deaths / batch_size),
size = 3)
p2The brms and marginaleffects plotting functions demonstrate that often shortcut functions can provide quick plot or summary, but to have more control you may still need write more code to get what you really want.
7 LD50 posterior
Compute posterior draws for lethal dose 50 (LD50) in log(g/ml) and in mg/ml
bdraws1 <- bdraws1 |>
mutate_variables(LD50_log_g_ml = -b_Intercept/b_dose,
LD50_mg_ml = 1000 * exp(LD50_log_g_ml))Posterior summary
bdraws1 |>
subset_draws(variable = "LD50_log_g_ml") |>
summarize_draws()# A tibble: 1 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 LD50_log_g_ml -0.0829 -0.0946 0.131 0.112 -0.275 0.151 1.00 2661. 2489.
Quantile dot plot of the LD50 (mg/ml) posterior.
ggplot_LD50_mg_ml +
stat_dots(data = bdraws1, quantiles = 100) +
coord_cartesian(expand = c(bottom = FALSE))Kernel density plot of the LD50 (mg/ml) posterior.
ggplot_LD50_mg_ml +
stat_slab(data = bdraws1, color = "gray", fill = NA) +
coord_cartesian(expand = c(bottom = FALSE))References
Licenses
- Code © 2025, Andrew Gelman and Aki Vehtari, licensed under BSD-3.
- Text © 2025, Andrew Gelman and Aki Vehtari, licensed under CC-BY-NC 4.0.