Logistic regression, identifiability, and separation. See Chapters 13 and 14 in Regression and Other Stories.
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("arm")
library("rstanarm")
library("foreign")
nes <- read.table(root("NES/data","nes.txt"), header=TRUE)
head(nes)
year resid weight1 weight2 weight3 age gender race educ1 urban region
536 1952 1 1 1 1 25 2 1 2 2 1
537 1952 2 1 1 1 33 2 1 1 2 1
538 1952 3 1 1 1 26 2 1 2 2 1
539 1952 4 1 1 1 63 1 1 2 2 1
540 1952 5 1 1 1 66 2 1 2 2 2
541 1952 6 1 1 1 48 2 1 2 2 2
income occup1 union religion educ2 educ3 martial_status occup2 icpsr_cty
536 4 2 1 1 3 3 1 2 NA
537 4 6 1 1 1 1 1 6 NA
538 3 6 2 2 3 3 1 6 NA
539 3 3 1 1 2 2 1 3 NA
540 1 6 2 1 4 4 1 6 NA
541 4 6 1 1 2 2 1 6 NA
fips_cty partyid7 partyid3 partyid3_b str_partyid father_party mother_party
536 NA 6 3 3 3 3 3
537 NA 5 3 3 2 2 2
538 NA 4 2 2 1 1 1
539 NA 7 3 3 4 1 NA
540 NA 7 3 3 4 1 1
541 NA 3 1 1 2 1 1
dlikes rlikes dem_therm rep_therm regis vote regisvote presvote
536 0 1 NA NA 2 2 3 2
537 -1 3 NA NA 2 2 3 1
538 0 5 NA NA 2 2 3 2
539 -1 3 NA NA 1 2 3 2
540 -2 0 NA NA 2 2 3 2
541 0 4 NA NA 2 2 3 2
presvote_2party presvote_intent ideo_feel ideo7 ideo cd state inter_pre
536 2 2 NA NA NA NA 13 50
537 1 2 NA NA NA NA 13 50
538 2 2 NA NA NA NA 13 50
539 2 2 NA NA NA NA 13 50
540 2 2 NA NA NA NA 24 49
541 2 2 NA NA NA NA 24 49
inter_post black female age_sq rep_presvote rep_pres_intent south real_ideo
536 NA 0 1 625 1 1 0 NA
537 NA 0 1 1089 0 1 0 NA
538 NA 0 1 676 1 1 0 NA
539 NA 0 0 3969 1 1 0 NA
540 NA 0 1 4356 1 1 0 NA
541 NA 0 1 2304 1 1 0 NA
presapprov perfin1 perfin2 perfin presadm age_10 age_sq_10 newfathe newmoth
536 NA NA NA NA -1 2.5 6.250000 1 1
537 NA NA NA NA -1 3.3 10.889999 0 0
538 NA NA NA NA -1 2.6 6.759999 -1 -1
539 NA NA NA NA -1 6.3 39.690002 -1 NA
540 NA NA NA NA -1 6.6 43.559998 -1 -1
541 NA NA NA NA -1 4.8 23.040001 -1 -1
parent_party white year_new income_new age_new vote.1 age_discrete
536 2 1 1 1 -2.052455 1 1
537 0 1 1 1 -1.252455 1 2
538 -2 1 1 0 -1.952455 1 1
539 NA 1 1 0 1.747545 1 3
540 -2 1 1 -2 2.047545 1 4
541 -2 1 1 1 0.247545 1 3
race_adj dvote rvote
536 1 0 1
537 1 1 0
538 1 0 1
539 1 0 1
540 1 0 1
541 1 0 1
Use first only data from 1992 and remove missing data
ok <- nes$year==1992 & !is.na(nes$rvote) & !is.na(nes$dvote) & (nes$rvote==1 | nes$dvote==1)
nes92 <- nes[ok,]
fit_1 <- stan_glm(rvote ~ income, family=binomial(link="logit"), data=nes92,
refresh=0)
print(fit_1)
stan_glm
family: binomial [logit]
formula: rvote ~ income
observations: 1179
predictors: 2
------
Median MAD_SD
(Intercept) -1.4 0.2
income 0.3 0.1
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
new <- data.frame(income=5)
Predict vote preference point estimate
pred <- predict(fit_1, type="response", newdata=new)
print(pred, digits=2)
1
0.56
Linear predictor with uncertainty
linpred <- posterior_linpred(fit_1, newdata=new)
head(linpred)
iterations 1
[1,] 0.25306873
[2,] 0.27232814
[3,] 0.12618496
[4,] 0.07759585
[5,] 0.30738049
[6,] 0.25514201
print(c(mean(linpred), sd(linpred)), digits=2)
[1] 0.23 0.12
Expected outcome with uncertainty
epred <- posterior_epred(fit_1, newdata=new)
head(epred)
iterations 1
[1,] 0.5629317
[2,] 0.5676644
[3,] 0.5315044
[4,] 0.5193892
[5,] 0.5762457
[6,] 0.5634417
print(c(mean(epred), sd(epred)), digits=2)
[1] 0.56 0.03
Predictive distribution for a new observation
postpred <- posterior_predict(fit_1, newdata=new)
head(postpred)
1
[1,] 1
[2,] 1
[3,] 0
[4,] 0
[5,] 1
[6,] 1
print(c(mean(postpred), sd(postpred)), digits=2)
[1] 0.54 0.50
new <- data.frame(income=1:5)
pred <- predict(fit_1, type="response", newdata=new)
linpred <- posterior_linpred(fit_1, newdata=new)
head(linpred)
iterations 1 2 3 4 5
[1,] -1.1880617 -0.8277791 -0.4674965 -0.10721389 0.25306873
[2,] -0.9849574 -0.6706360 -0.3563146 -0.04199324 0.27232814
[3,] -0.7970669 -0.5662540 -0.3354410 -0.10462802 0.12618496
[4,] -1.0633428 -0.7781081 -0.4928735 -0.20763881 0.07759585
[5,] -1.0605631 -0.7185772 -0.3765913 -0.03460540 0.30738049
[6,] -0.9290029 -0.6329667 -0.3369304 -0.04089422 0.25514201
epred <- posterior_epred(fit_1, newdata=new)
head(epred)
iterations 1 2 3 4 5
[1,] 0.2336058 0.3041149 0.3852090 0.4732222 0.5629317
[2,] 0.2719092 0.3383544 0.4118520 0.4895032 0.5676644
[3,] 0.3106533 0.3621017 0.4169173 0.4738668 0.5315044
[4,] 0.2566712 0.3147278 0.3792169 0.4482760 0.5193892
[5,] 0.2572019 0.3277064 0.4069493 0.4913495 0.5762457
[6,] 0.2831270 0.3468382 0.4165553 0.4897779 0.5634417
postpred <- posterior_predict(fit_1, newdata=new)
head(postpred)
1 2 3 4 5
[1,] 0 0 1 0 1
[2,] 0 1 1 1 0
[3,] 1 0 0 1 0
[4,] 0 0 1 0 1
[5,] 0 0 0 0 0
[6,] 0 0 0 0 1
the posterior probability, according to the fitted model, that Bush was more popular among people with income level 5 than among people with income level 4
mean(epred[,5] > epred[,4])
[1] 1
95% posterior distribution for the difference in support for Bush, comparing people in the richest to the second-richest category
quantile(epred[,5] - epred[,4], c(0.025, 0.975))
2.5% 97.5%
0.05382157 0.10976404
data <- data.frame(rvote=rep(c(0,1), 10), income=1:20)
fit_f <- stan_glm(rvote ~ income, family=binomial(link="logit"), data=data,
refresh=0)
new <- data.frame(income=5)
predict <- posterior_predict(fit_f, newdata=new)
n <- nrow(nes92)
income_jitt <- nes92$income + runif(n, -.2, .2)
vote_jitt <- nes92$rvote + ifelse(nes92$rvote==0, runif(n, .005, .05), runif(n, -.05, -.005))
par(mar=c(3,3,1,.1), tck=-.01, mgp=c(1.7, .3, 0))
ok <- nes92$presvote<3
vote <- nes92$presvote[ok] - 1
income <- nes92$income[ok]
curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), 1, 5, ylim=c(0,1),
xlim=c(-2,8), xaxt="n", xaxs="i",
ylab="Pr (Republican vote)", xlab="Income", lwd=4, yaxs="i")
curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), -2, 8, lwd=.5, add=TRUE)
axis(1, 1:5)
mtext("(poor)", 1, 1.2, at=1, adj=.5)
mtext("(rich)", 1, 1.2, at=5, adj=.5)
points(income_jitt, vote_jitt, pch=20, cex=.1)
par(mar=c(3,3,1,.1), tck=-.01, mgp=c(1.7, .3, 0))
ok <- nes92$presvote<3
vote <- nes92$presvote[ok] - 1
income <- nes92$income[ok]
curve (invlogit(fit_1$coef[1] + fit_1$coef[2]*x), .5, 5.5, ylim=c(0,1),
xlim=c(.5, 5.5), xaxt="n", xaxs="i",
ylab="Pr (Republican vote)", xlab="Income", yaxs="i")
axis(1, 1:5)
mtext("(poor)", 1, 1.2, at=1, adj=.5)
mtext("(rich)", 1, 1.2, at=5, adj=.5)
sims_1 <- as.matrix(fit_1)
n_sims <- nrow(sims_1)
for (j in sample(n_sims, 20)){
curve(invlogit(sims_1[j,1] +sims_1[j,2]*x), .5, 5.5, lwd=.5, col="gray", add=TRUE)
}
curve(invlogit(fit_1$coef[1] + fit_1$coef[2]*x), .5, 5.5, add=TRUE)
points(income_jitt, vote_jitt, pch=20, cex=.1)
yrs <- seq(1952, 2000, 4)
n_yrs <- length(yrs)
fits <- array(NA, c(n_yrs, 3), dimnames <- list(yrs, c("year", "coef", "se")))
for (j in 1:n_yrs){
yr <- yrs[j]
ok <- (nes$year==yr & !is.na(nes$presvote) & nes$presvote<3 &
!is.na(nes$vote) & !is.na(nes$income))
vote <- nes$presvote[ok] - 1
income <- nes$income[ok]
fit_y <- stan_glm(vote ~ income, family=binomial(link="logit"),
data = data.frame(vote, income),
warmup = 500, iter = 1500, refresh = 0,
save_warmup = FALSE, cores = 1, open_progress = FALSE)
fits[j,] <- c(yr, coef(fit_y)[2], se(fit_y)[2])
}
par(mar=c(3,2.5,1,.2), tck=-.01, mgp=c(1.5, .3, 0))
plot (fits[,"year"], fits[,"coef"], xlim=c(1950,2000), ylim=range(fits[,"coef"]-fits[,"se"], fits[,"coef"]+fits[,"se"]),
pch=20, ylab="Coefficient of income", xlab="Year", bty="l")
for (j in 1:n_yrs){
lines(rep(fits[j,"year"], 2), fits[j,"coef"] + fits[j,"se"]*c(-1,1), lwd=.5)
}
abline(0,0,lwd=.5, lty=2)
predp <- fitted(fit_1)
round(c(mean(predp[nes92$rvote==1]), mean(1-predp[nes92$rvote==0])), 3)
[1] 0.422 0.607
Estimate the predictive performance of a model using within-sample log-score
round(sum(log(c(predp[nes92$rvote==1], 1-predp[nes92$rvote==0]))), 1)
[1] -778.5
Estimate the predictive performance of a model using leave-one-out log-score (elpd_loo)
loo(fit_1)
Computed from 4000 by 1179 log-likelihood matrix
Estimate SE
elpd_loo -780.5 8.6
p_loo 2.1 0.1
looic 1561.0 17.1
------
Monte Carlo SE of elpd_loo is 0.0.
All Pareto k estimates are good (k < 0.5).
See help('pareto-k-diagnostic') for details.
Use "black" as a predictor (nonidentifiability in 1964)
fits_2 <- array(NA, c(n_yrs, 4, 2, 2), dimnames <- list(yrs, c("Intercept", "female", "black", "income"), c("coef", "se"), c("glm", "bayes")))
ok <- (!is.na(nes$rvote) & !is.na(nes$female) & !is.na(nes$black) & !is.na(nes$income))
for (j in 1:n_yrs){
print(yrs[j])
fit_glm <- glm(rvote ~ female + black + income, subset=(year==yrs[j]),
family=binomial(link="logit"), data=nes[ok,])
fits_2[j,,1,1] <- coef(fit_glm)
fits_2[j,,2,1] <- se.coef(fit_glm)
fit_bayes <- stan_glm(rvote ~ female + black + income, subset=(year==yrs[j]),
family=binomial(link="logit"), data=nes[ok,],
warmup = 500, iter = 1500, refresh = 0,
save_warmup = FALSE, cores = 1, open_progress = FALSE)
fits_2[j,,1,2] <- coef(fit_bayes)
fits_2[j,,2,2] <- se(fit_bayes)
display(fit_glm)
print(fit_bayes)
}
par(mfrow=c(2,5), mar=c(3,3,0,1), tck=-.02, mgp=c(1.2,.3,0), oma=c(0,0,3,0))
for (k in 1:2){
plot(0,0,xlab="",ylab="",xaxt="n",yaxt="n",bty="n",type="n")
text(0, 0, if (k==1) "Maximum\nlikelihood\nestimate,\nfrom glm()" else "Bayes estimate\nwith default prior,\nfrom stan_glm()", cex=1.3)
for (l in 1:4){
rng <- range(fits_2[,l,"coef",] - fits_2[,l,"se",], fits_2[,l,"coef",] + fits_2[,l,"se",])
if (l==3) rng <- c(-18, 1)
plot(yrs, fits_2[,l,"coef",k], ylim=rng, ylab="Coefficient", xlab=if (k==2) "Year" else "", bty="l", xaxt="n", pch=20)
axis(1, c(1960, 1980, 2000))
abline(0,0,lwd=.5,lty=2)
for (j in 1:n_yrs){
lines(rep(yrs[j], 2), c(fits_2[j,l,"coef",k] - fits_2[j,l,"se",k], fits_2[j,l,"coef",k] + fits_2[j,l,"se",k]), lwd=.5)
}
if (k==1) mtext(dimnames(fits_2)[[2]][l], 3, 1.5, cex=.8)
}
}