Demonstrate poststratification with simulated census and poll data See Chapter 17 in Regression and Other Stories.
library("rstanarm")
invlogit <- plogis
Create an empty poststratification table, with rows for each of the \(2 \times 4 \times 4\) sorts of people.
J <- c(2, 4, 4)
poststrat <- as.data.frame(array(NA, c(prod(J), length(J)+1)))
colnames(poststrat) <- c("sex", "age", "eth", "N")
count <- 0
for (i1 in 1:J[1]){
for (i2 in 1:J[2]){
for (i3 in 1:J[3]){
count <- count + 1
poststrat[count, 1:3] <- c(i1, i2, i3)
}
}
}
make up numbers for the populations of the cells
p_sex <- c(0.52, 0.48)
p_age <- c(0.2, 0.25, 0.3, 0.25)
p_eth <- c(0.7, 0.1, 0.1, 0.1)
for (j in 1:prod(J)){
poststrat$N[j] <- 250e6 * p_sex[poststrat[j,1]] * p_age[poststrat[j,2]] *
p_eth[poststrat[j,3]]
}
Hypothesize a nonresponse pattern in which women, older people, and whites are more likely to respond than men, younger people, and minorities
p_response_baseline <- 0.1
p_response_sex <- c(1, 0.8)
p_response_age <- c(1, 1.2, 1.6, 2.5)
p_response_eth <- c(1, 0.8, 0.7, 0.6)
p_response <- rep(NA, prod(J))
for (j in 1:prod(J)){
p_response[j] <- p_response_baseline * p_response_sex[poststrat[j,1]] *
p_response_age[poststrat[j,2]] * p_response_eth[poststrat[j,3]]
}
Sample from the assumed population with the assumed nonresponse probabilities
n <- 1000
people <- sample(prod(J), n, replace=TRUE, prob=poststrat$N*p_response)
# For respondent i, people[i] is that person's poststrat cell,
# some number between 1 and 32
n_cell <- rep(NA, prod(J))
for (j in 1:prod(J)){
n_cell[j] <- sum(people==j)
}
print(cbind(poststrat, n_cell/n, poststrat$N/sum(poststrat$N)))
## sex age eth N n_cell/n poststrat$N/sum(poststrat$N)
## 1 1 1 1 18200000 0.056 0.0728
## 2 1 1 2 2600000 0.004 0.0104
## 3 1 1 3 2600000 0.006 0.0104
## 4 1 1 4 2600000 0.003 0.0104
## 5 1 2 1 22750000 0.074 0.0910
## 6 1 2 2 3250000 0.010 0.0130
## 7 1 2 3 3250000 0.006 0.0130
## 8 1 2 4 3250000 0.008 0.0130
## 9 1 3 1 27300000 0.148 0.1092
## 10 1 3 2 3900000 0.016 0.0156
## 11 1 3 3 3900000 0.013 0.0156
## 12 1 3 4 3900000 0.011 0.0156
## 13 1 4 1 22750000 0.175 0.0910
## 14 1 4 2 3250000 0.016 0.0130
## 15 1 4 3 3250000 0.023 0.0130
## 16 1 4 4 3250000 0.009 0.0130
## 17 2 1 1 16800000 0.035 0.0672
## 18 2 1 2 2400000 0.006 0.0096
## 19 2 1 3 2400000 0.003 0.0096
## 20 2 1 4 2400000 0.002 0.0096
## 21 2 2 1 21000000 0.055 0.0840
## 22 2 2 2 3000000 0.010 0.0120
## 23 2 2 3 3000000 0.009 0.0120
## 24 2 2 4 3000000 0.004 0.0120
## 25 2 3 1 25200000 0.095 0.1008
## 26 2 3 2 3600000 0.011 0.0144
## 27 2 3 3 3600000 0.007 0.0144
## 28 2 3 4 3600000 0.004 0.0144
## 29 2 4 1 21000000 0.137 0.0840
## 30 2 4 2 3000000 0.019 0.0120
## 31 2 4 3 3000000 0.018 0.0120
## 32 2 4 4 3000000 0.007 0.0120
Assume the survey responses come from a logistic regression with these coefficients
coef_intercept <- 0.6
coef_sex <- c(0, -0.2)
coef_age <- c(0, -0.2, -0.3, -0.4)
coef_eth <- c(0, 0.6, 0.3, 0.3)
The probabilities are:
prob_yes <- rep(NA, prod(J))
for (j in 1:prod(J)){
prob_yes[j] <- invlogit(coef_intercept + coef_sex[poststrat[j,1]] +
coef_age[poststrat[j,2]] + coef_eth[poststrat[j,3]])
}
Simulate the fake data:
y <- rbinom(n, 1, prob_yes[people])
sex <- poststrat[people,1]
age <- poststrat[people,2]
eth <- poststrat[people,3]
fake <- data.frame(y, sex, age, eth)
fit <- stan_glm(y ~ factor(sex) + factor(age) + factor(eth),
family=binomial(link="logit"), data=fake, refresh=0)
print(fit)
## stan_glm
## family: binomial [logit]
## formula: y ~ factor(sex) + factor(age) + factor(eth)
## observations: 1000
## predictors: 8
## ------
## Median MAD_SD
## (Intercept) 0.8 0.2
## factor(sex)2 -0.3 0.1
## factor(age)2 -0.1 0.3
## factor(age)3 -0.5 0.2
## factor(age)4 -0.5 0.2
## factor(eth)2 0.8 0.2
## factor(eth)3 0.6 0.3
## factor(eth)4 0.3 0.3
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
pred_sim <- posterior_epred(fit, newdata=as.data.frame(poststrat))
pred_est <- colMeans(pred_sim)
round(cbind(poststrat, prob_yes, pred_est), 2)
## sex age eth N prob_yes pred_est
## 1 1 1 1 18200000 0.65 0.68
## 2 1 1 2 2600000 0.77 0.82
## 3 1 1 3 2600000 0.71 0.80
## 4 1 1 4 2600000 0.71 0.75
## 5 1 2 1 22750000 0.60 0.66
## 6 1 2 2 3250000 0.73 0.80
## 7 1 2 3 3250000 0.67 0.79
## 8 1 2 4 3250000 0.67 0.73
## 9 1 3 1 27300000 0.57 0.58
## 10 1 3 2 3900000 0.71 0.74
## 11 1 3 3 3900000 0.65 0.72
## 12 1 3 4 3900000 0.65 0.66
## 13 1 4 1 22750000 0.55 0.57
## 14 1 4 2 3250000 0.69 0.74
## 15 1 4 3 3250000 0.62 0.71
## 16 1 4 4 3250000 0.62 0.65
## 17 2 1 1 16800000 0.60 0.63
## 18 2 1 2 2400000 0.73 0.78
## 19 2 1 3 2400000 0.67 0.76
## 20 2 1 4 2400000 0.67 0.70
## 21 2 2 1 21000000 0.55 0.61
## 22 2 2 2 3000000 0.69 0.76
## 23 2 2 3 3000000 0.62 0.74
## 24 2 2 4 3000000 0.62 0.68
## 25 2 3 1 25200000 0.52 0.52
## 26 2 3 2 3600000 0.67 0.69
## 27 2 3 3 3600000 0.60 0.67
## 28 2 3 4 3600000 0.60 0.60
## 29 2 4 1 21000000 0.50 0.51
## 30 2 4 2 3000000 0.65 0.69
## 31 2 4 3 3000000 0.57 0.66
## 32 2 4 4 3000000 0.57 0.59
poststrat_est <- sum(poststrat$N*pred_est)/sum(poststrat$N)
round(poststrat_est, 2)
## [1] 0.63
plus uncertainty
poststrat_sim <- pred_sim %*% poststrat$N / sum(poststrat$N)
round(c(mean(poststrat_sim), sd(poststrat_sim)), 3)
## [1] 0.628 0.016