Comparison of robit and logit models for binary data. See Chapter 15 in Regression and Other Stories.
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("cmdstanr")
options(mc.cores = 1)
library("ggdist")
logit <- qlogis
invlogit <- plogis
# set the random seed to get reproducible results
# change the seed to experiment with variation due to random noise
set.seed(1234)
N <- 50
x <- runif(N, -9, 9)
a <- 0
b <- 0.8
p <- invlogit(a + b*x)
y <- rbinom(N, 1, p)
df <- 4
data_1 <- list(N=N, x=x, y=y, df=df)
writeLines(readLines(root("Robit","logit.stan")))
data {
int N;
vector[N] x;
int y[N];
}
parameters {
real a;
real b;
}
model {
y ~ bernoulli_logit(a + b*x);
}
writeLines(readLines(root("Robit","robit.stan")))
data {
int N;
vector[N] x;
int y[N];
real df;
}
parameters {
real a;
real b;
}
model {
vector[N] p;
for (n in 1:N) p[n] = student_t_cdf(a + b*x[n], df, 0, sqrt((df - 2)/df));
y ~ bernoulli(p);
}
logit_model <- cmdstan_model("logit.stan")
robit_model <- cmdstan_model("robit.stan")
fit_logit_1 <- logit_model$sample(data=data_1, refresh=0)
Running MCMC with 4 sequential chains...
Chain 1 finished in 0.0 seconds.
Chain 2 finished in 0.0 seconds.
Chain 3 finished in 0.0 seconds.
Chain 4 finished in 0.0 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.0 seconds.
Total execution time: 0.6 seconds.
print(fit_logit_1)
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
lp__ -9.85 -9.49 1.13 0.80 -12.13 -8.76 1.00 1509 1559
a -0.63 -0.60 0.74 0.72 -1.91 0.53 1.00 1725 1888
b 1.44 1.36 0.52 0.47 0.77 2.39 1.00 1737 1364
a_hat_logit_1 <- median(fit_logit_1$draws("a"))
b_hat_logit_1 <- median(fit_logit_1$draws("b"))
fit_robit_1 <- robit_model$sample(data=data_1, refresh=0)
Running MCMC with 4 sequential chains...
Chain 1 finished in 0.3 seconds.
Chain 2 finished in 0.3 seconds.
Chain 3 finished in 0.3 seconds.
Chain 4 finished in 0.3 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.3 seconds.
Total execution time: 1.4 seconds.
print(fit_robit_1)
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
lp__ -10.04 -9.65 1.23 0.89 -12.61 -8.86 1.00 1065 1047
a -0.34 -0.31 0.40 0.38 -1.04 0.27 1.00 1146 1300
b 0.86 0.76 0.42 0.30 0.40 1.66 1.00 1072 1148
a_hat_robit_1 <- median(fit_robit_1$draws("a"))
b_hat_robit_1 <- median(fit_robit_1$draws("b"))
if (savefigs) pdf("logistic2b.pdf", height=4, width=6)
par(mar=c(3,3,2,1), mgp=c(1.5,.5,0), tck=-.01)
plot(data_1$x, data_1$y, yaxt="n", main="Data from a logistic regression", xlab="x", ylab="y")
axis(2, c(0,1))
curve(invlogit(a_hat_logit_1 + b_hat_logit_1*x), add=TRUE, lty=2)
curve(pstudent_t(a_hat_robit_1 + b_hat_robit_1*x, data_1$df, 0, sqrt((data_1$df-2)/data_1$df)), add=TRUE, lty=1)
legend (1, .3, c("fitted logistic regression", "fitted robit regression"), lty=c(2,1), cex=.8)
low_value <- (1:N)[x==sort(x)[4]]
data_2 <- data_1
data_2$y[low_value] <- 1
fit_logit_2 <- logit_model$sample(data=data_2, refresh=0)
Running MCMC with 4 sequential chains...
Chain 1 finished in 0.0 seconds.
Chain 2 finished in 0.0 seconds.
Chain 3 finished in 0.1 seconds.
Chain 4 finished in 0.0 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.0 seconds.
Total execution time: 0.5 seconds.
print(fit_logit_2)
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
lp__ -15.60 -15.25 1.08 0.75 -17.78 -14.59 1.00 1819 2284
a 0.04 0.04 0.51 0.50 -0.77 0.89 1.00 2705 2184
b 0.74 0.72 0.20 0.18 0.46 1.10 1.00 2828 2353
a_hat_logit_2 <- median(fit_logit_2$draws("a"))
b_hat_logit_2 <- median(fit_logit_2$draws("b"))
fit_robit_2 <- robit_model$sample(data=data_2, refresh=0)
Running MCMC with 4 sequential chains...
Chain 1 finished in 0.3 seconds.
Chain 2 finished in 0.3 seconds.
Chain 3 finished in 0.3 seconds.
Chain 4 finished in 0.3 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.3 seconds.
Total execution time: 1.5 seconds.
print(fit_robit_2)
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
lp__ -15.21 -14.89 1.09 0.79 -17.49 -14.16 1.00 1393 1616
a -0.08 -0.07 0.29 0.28 -0.54 0.40 1.00 1580 1860
b 0.47 0.44 0.17 0.14 0.26 0.78 1.00 2021 1484
a_hat_robit_2 <- median(fit_robit_2$draws("a"))
b_hat_robit_2 <- median(fit_robit_2$draws("b"))
Plot
if (savefigs) pdf("logistic2a.pdf", height=4, width=6)
par(mar=c(3,3,2,1), mgp=c(1.5,.5,0), tck=-.01)
plot(data_2$x, data_2$y, yaxt="n", main="Contaminated data", xlab="x", ylab="y")
axis(2, c(0,1))
curve(invlogit(a_hat_logit_2 + b_hat_logit_2*x), add=TRUE, lty=2)
curve(pstudent_t(a_hat_robit_2 + b_hat_robit_2*x, data_2$df, 0, sqrt((data_2$df-2)/data_2$df)), add=TRUE, lty=1)
legend (1, .3, c("fitted logistic regression", "fitted robit regression"), lty=c(2,1), cex=.8)