Demonstrate poststratification with simulated census and poll data. See Chapter 17 in Regression and Other Stories.


Load packages

library("rstanarm")
# invlogit: the inverse-logit (logistic) function; base R's plogis computes it
invlogit <- plogis

Simulate fake data

Create an empty poststratification table, with rows for each of the \(2 \times 4 \times 4\) sorts of people.

# Number of levels of each demographic factor: sex, age, ethnicity
J <- c(2, 4, 4)
# Build the 2 x 4 x 4 = 32-row poststratification table without nested loops.
# expand.grid varies its FIRST argument fastest, so listing eth first and sex
# last reproduces the conventional ordering (sex slowest, eth fastest).
cells <- expand.grid(eth = seq_len(J[3]), age = seq_len(J[2]), sex = seq_len(J[1]))
# N (cell population counts) is filled in later
poststrat <- data.frame(sex = cells$sex, age = cells$age, eth = cells$eth, N = NA)

Make up numbers for the populations of the cells.

# Assumed marginal population proportions for each factor
p_sex <- c(0.52, 0.48)
p_age <- c(0.2, 0.25, 0.3, 0.25)
p_eth <- c(0.7, 0.1, 0.1, 0.1)
# Cell population = total population (250 million) times the product of the
# marginal proportions, i.e. assuming independence of sex, age, and ethnicity.
# Vectorized over all cells; no row-by-row loop needed.
poststrat$N <- 250e6 * p_sex[poststrat$sex] * p_age[poststrat$age] *
  p_eth[poststrat$eth]

Hypothesize a nonresponse pattern in which women, older people, and whites are more likely to respond than men, younger people, and minorities

# Multiplicative nonresponse model: baseline response rate times factors for
# sex, age, and ethnicity (women, older people, and whites respond more).
p_response_baseline <- 0.1
p_response_sex <- c(1, 0.8)
p_response_age <- c(1, 1.2, 1.6, 2.5)
p_response_eth <- c(1, 0.8, 0.7, 0.6)
# Vectorized over the poststratification cells
p_response <- p_response_baseline * p_response_sex[poststrat$sex] *
  p_response_age[poststrat$age] * p_response_eth[poststrat$eth]

Sample from the assumed population with the assumed nonresponse probabilities

n <- 1000
# Draw respondents: a cell's selection probability is proportional to its
# population times its response rate (sample() normalizes prob internally)
people <- sample(prod(J), n, replace=TRUE, prob=poststrat$N*p_response)
# For respondent i, people[i] is that person's poststrat cell,
# some number between 1 and 32
# Count respondents per cell; tabulate() replaces the per-cell sum() loop
n_cell <- tabulate(people, nbins = prod(J))
# Compare realized sample shares with true population shares
print(cbind(poststrat, n_cell/n, poststrat$N/sum(poststrat$N)))
##    sex age eth        N n_cell/n poststrat$N/sum(poststrat$N)
## 1    1   1   1 18200000    0.056                       0.0728
## 2    1   1   2  2600000    0.004                       0.0104
## 3    1   1   3  2600000    0.006                       0.0104
## 4    1   1   4  2600000    0.003                       0.0104
## 5    1   2   1 22750000    0.074                       0.0910
## 6    1   2   2  3250000    0.010                       0.0130
## 7    1   2   3  3250000    0.006                       0.0130
## 8    1   2   4  3250000    0.008                       0.0130
## 9    1   3   1 27300000    0.148                       0.1092
## 10   1   3   2  3900000    0.016                       0.0156
## 11   1   3   3  3900000    0.013                       0.0156
## 12   1   3   4  3900000    0.011                       0.0156
## 13   1   4   1 22750000    0.175                       0.0910
## 14   1   4   2  3250000    0.016                       0.0130
## 15   1   4   3  3250000    0.023                       0.0130
## 16   1   4   4  3250000    0.009                       0.0130
## 17   2   1   1 16800000    0.035                       0.0672
## 18   2   1   2  2400000    0.006                       0.0096
## 19   2   1   3  2400000    0.003                       0.0096
## 20   2   1   4  2400000    0.002                       0.0096
## 21   2   2   1 21000000    0.055                       0.0840
## 22   2   2   2  3000000    0.010                       0.0120
## 23   2   2   3  3000000    0.009                       0.0120
## 24   2   2   4  3000000    0.004                       0.0120
## 25   2   3   1 25200000    0.095                       0.1008
## 26   2   3   2  3600000    0.011                       0.0144
## 27   2   3   3  3600000    0.007                       0.0144
## 28   2   3   4  3600000    0.004                       0.0144
## 29   2   4   1 21000000    0.137                       0.0840
## 30   2   4   2  3000000    0.019                       0.0120
## 31   2   4   3  3000000    0.018                       0.0120
## 32   2   4   4  3000000    0.007                       0.0120

Assume the survey responses come from a logistic regression with these coefficients

# True logistic-regression coefficients for the "yes" response; the first
# level of each factor is the reference category (coefficient 0)
coef_intercept <- 0.6
coef_sex <- c(0, -0.2)
coef_age <- c(0, -0.2, -0.3, -0.4)
coef_eth <- c(0, 0.6, 0.3, 0.3)

The probabilities are:

# True P(yes) for each poststratification cell under the logistic model.
# Vectorized: index each coefficient vector by the cell's factor levels.
prob_yes <- invlogit(coef_intercept + coef_sex[poststrat$sex] +
  coef_age[poststrat$age] + coef_eth[poststrat$eth])

Simulate the fake data:

# Binary survey responses drawn with each respondent's cell-specific P(yes)
y <- rbinom(n, 1, prob_yes[people])

Fit a logistic regression model

# Look up each respondent's demographics from their poststratification cell
sex <- poststrat[people, "sex"]
age <- poststrat[people, "age"]
eth <- poststrat[people, "eth"]
fake <- data.frame(y = y, sex = sex, age = age, eth = eth)
# Bayesian logistic regression with additive factor effects (default priors)
fit <- stan_glm(y ~ factor(sex) + factor(age) + factor(eth),
                family = binomial(link = "logit"), data = fake, refresh = 0)
print(fit)
## stan_glm
##  family:       binomial [logit]
##  formula:      y ~ factor(sex) + factor(age) + factor(eth)
##  observations: 1000
##  predictors:   8
## ------
##              Median MAD_SD
## (Intercept)   0.8    0.2  
## factor(sex)2 -0.3    0.1  
## factor(age)2 -0.1    0.3  
## factor(age)3 -0.5    0.2  
## factor(age)4 -0.5    0.2  
## factor(eth)2  0.8    0.2  
## factor(eth)3  0.6    0.3  
## factor(eth)4  0.3    0.3  
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg

Prediction

# Posterior expected P(yes) for every poststratification cell:
# rows of pred_sim are posterior draws, columns are the 32 cells.
# poststrat is already a data frame, so it can be passed directly.
pred_sim <- posterior_epred(fit, newdata = poststrat)
pred_est <- colMeans(pred_sim)
# Compare the model's cell estimates with the true probabilities
round(cbind(poststrat, prob_yes, pred_est), 2)
##    sex age eth        N prob_yes pred_est
## 1    1   1   1 18200000     0.65     0.68
## 2    1   1   2  2600000     0.77     0.82
## 3    1   1   3  2600000     0.71     0.80
## 4    1   1   4  2600000     0.71     0.75
## 5    1   2   1 22750000     0.60     0.66
## 6    1   2   2  3250000     0.73     0.80
## 7    1   2   3  3250000     0.67     0.79
## 8    1   2   4  3250000     0.67     0.73
## 9    1   3   1 27300000     0.57     0.58
## 10   1   3   2  3900000     0.71     0.74
## 11   1   3   3  3900000     0.65     0.72
## 12   1   3   4  3900000     0.65     0.66
## 13   1   4   1 22750000     0.55     0.57
## 14   1   4   2  3250000     0.69     0.74
## 15   1   4   3  3250000     0.62     0.71
## 16   1   4   4  3250000     0.62     0.65
## 17   2   1   1 16800000     0.60     0.63
## 18   2   1   2  2400000     0.73     0.78
## 19   2   1   3  2400000     0.67     0.76
## 20   2   1   4  2400000     0.67     0.70
## 21   2   2   1 21000000     0.55     0.61
## 22   2   2   2  3000000     0.69     0.76
## 23   2   2   3  3000000     0.62     0.74
## 24   2   2   4  3000000     0.62     0.68
## 25   2   3   1 25200000     0.52     0.52
## 26   2   3   2  3600000     0.67     0.69
## 27   2   3   3  3600000     0.60     0.67
## 28   2   3   4  3600000     0.60     0.60
## 29   2   4   1 21000000     0.50     0.51
## 30   2   4   2  3000000     0.65     0.69
## 31   2   4   3  3000000     0.57     0.66
## 32   2   4   4  3000000     0.57     0.59

Poststratification

# Population-weighted average of the cell estimates; weighted.mean()
# replaces the hand-rolled sum(w*x)/sum(w)
poststrat_est <- weighted.mean(pred_est, w = poststrat$N)
round(poststrat_est, 2)
## [1] 0.63

Plus uncertainty: poststratify each posterior draw to propagate inferential uncertainty.

# Weighted average of the cells within each posterior draw gives a full
# posterior for the population average; drop() turns the n_draws x 1
# matrix into a plain vector
poststrat_sim <- drop(pred_sim %*% poststrat$N) / sum(poststrat$N)
round(c(mean(poststrat_sim), sd(poststrat_sim)), 3)
## [1] 0.628 0.016
LS0tCnRpdGxlOiAiUmVncmVzc2lvbiBhbmQgT3RoZXIgU3RvcmllczogUG9zdHN0cmF0aWZpY2F0aW9uIDIiCmF1dGhvcjogIkFuZHJldyBHZWxtYW4sIEplbm5pZmVyIEhpbGwsIEFraSBWZWh0YXJpIgpkYXRlOiAiYHIgZm9ybWF0KFN5cy5EYXRlKCkpYCIKb3V0cHV0OgogIGh0bWxfZG9jdW1lbnQ6CiAgICB0aGVtZTogcmVhZGFibGUKICAgIHRvYzogdHJ1ZQogICAgdG9jX2RlcHRoOiAyCiAgICB0b2NfZmxvYXQ6IHRydWUKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKLS0tCkRlbW9uc3RyYXRlIHBvc3RzdHJhdGlmaWNhdGlvbiB3aXRoIHNpbXVsYXRlZCBjZW5zdXMgYW5kIHBvbGwgZGF0YQpTZWUgQ2hhcHRlciAxNyBpbiBSZWdyZXNzaW9uIGFuZCBPdGhlciBTdG9yaWVzLgoKCi0tLS0tLS0tLS0tLS0KCiMjIyMgTG9hZCBwYWNrYWdlcwoKYGBge3Igc2V0dXAsIG1lc3NhZ2U9RkFMU0UsIGVycm9yPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQpsaWJyYXJ5KCJyc3RhbmFybSIpCmludmxvZ2l0IDwtIHBsb2dpcwpgYGAKCiMjIFNpbXVsYXRlIGZha2UgZGF0YQoKQ3JlYXRlIGFuIGVtcHR5IHBvc3RzdHJhdGlmaWNhdGlvbiB0YWJsZSwgd2l0aCByb3dzIGZvciBlYWNoIG9mIHRoZQokMiBcdGltZXMgNCBcdGltZXMgNCQgc29ydHMgb2YgcGVvcGxlLgoKYGBge3IgfQpKIDwtIGMoMiwgNCwgNCkKcG9zdHN0cmF0IDwtIGFzLmRhdGEuZnJhbWUoYXJyYXkoTkEsIGMocHJvZChKKSwgbGVuZ3RoKEopKzEpKSkKY29sbmFtZXMocG9zdHN0cmF0KSA8LSBjKCJzZXgiLCAiYWdlIiwgImV0aCIsICJOIikKY291bnQgPC0gMApmb3IgKGkxIGluIDE6SlsxXSl7CiAgZm9yIChpMiBpbiAxOkpbMl0pewogICAgZm9yIChpMyBpbiAxOkpbM10pewogICAgICBjb3VudCA8LSBjb3VudCArIDEKICAgICAgcG9zdHN0cmF0W2NvdW50LCAxOjNdIDwtIGMoaTEsIGkyLCBpMykKICAgIH0KICB9Cn0KYGBgCgptYWtlIHVwIG51bWJlcnMgZm9yIHRoZSBwb3B1bGF0aW9ucyBvZiB0aGUgY2VsbHMKCmBgYHtyIH0KcF9zZXggPC0gYygwLjUyLCAwLjQ4KQpwX2FnZSA8LSBjKDAuMiwgMC4yNSwgMC4zLCAwLjI1KQpwX2V0aCA8LSBjKDAuNywgMC4xLCAwLjEsIDAuMSkKZm9yIChqIGluIDE6cHJvZChKKSl7CiAgcG9zdHN0cmF0JE5bal0gPC0gMjUwZTYgKiBwX3NleFtwb3N0c3RyYXRbaiwxXV0gKiBwX2FnZVtwb3N0c3RyYXRbaiwyXV0gKgogIHBfZXRoW3Bvc3RzdHJhdFtqLDNdXQp9CmBgYAoKSHlwb3RoZXNpemUgYSBub25yZXNwb25zZSBwYXR0ZXJuIGluIHdoaWNoIHdvbWVuLCBvbGRlciBwZW9wbGUsIGFuZAp3aGl0ZXMgYXJlIG1vcmUgbGlrZWx5IHRvIHJlc3BvbmQgdGhhbiBtZW4sIHlvdW5nZXIgcGVvcGxlLCBhbmQKbWlub3JpdGllcwoKYGBge3IgfQpwX3Jlc3BvbnNlX2Jhc2VsaW5lIDwtIDAuMQpwX3Jlc3BvbnNlX3NleCA8LSBjKDEsIDAuOCkKcF9yZXNwb25zZV9hZ2UgPC0gYygxLCAxLjIsIDEuNiwgMi41KQpwX3Jlc3BvbnNlX2V0aCA8LSBjKDEsIDAu
OCwgMC43LCAwLjYpCnBfcmVzcG9uc2UgPC0gcmVwKE5BLCBwcm9kKEopKQpmb3IgKGogaW4gMTpwcm9kKEopKXsKICBwX3Jlc3BvbnNlW2pdIDwtIHBfcmVzcG9uc2VfYmFzZWxpbmUgKiBwX3Jlc3BvbnNlX3NleFtwb3N0c3RyYXRbaiwxXV0gKgogICAgcF9yZXNwb25zZV9hZ2VbcG9zdHN0cmF0W2osMl1dICogcF9yZXNwb25zZV9ldGhbcG9zdHN0cmF0W2osM11dCn0KYGBgCgpTYW1wbGUgZnJvbSB0aGUgYXNzdW1lZCBwb3B1bGF0aW9uIHdpdGggdGhlIGFzc3VtZWQgbm9ucmVzcG9uc2UgcHJvYmFiaWxpdGllcwoKYGBge3IgfQpuIDwtIDEwMDAKcGVvcGxlIDwtIHNhbXBsZShwcm9kKEopLCBuLCByZXBsYWNlPVRSVUUsIHByb2I9cG9zdHN0cmF0JE4qcF9yZXNwb25zZSkKIyBGb3IgcmVzcG9uZGVudCBpLCBwZW9wbGVbaV0gaXMgdGhhdCBwZXJzb24ncyBwb3N0c3RyYXQgY2VsbCwKIyBzb21lIG51bWJlciBiZXR3ZWVuIDEgYW5kIDMyCm5fY2VsbCA8LSByZXAoTkEsIHByb2QoSikpCmZvciAoaiBpbiAxOnByb2QoSikpewogIG5fY2VsbFtqXSA8LSBzdW0ocGVvcGxlPT1qKQp9CnByaW50KGNiaW5kKHBvc3RzdHJhdCwgbl9jZWxsL24sIHBvc3RzdHJhdCROL3N1bShwb3N0c3RyYXQkTikpKQpgYGAKCkFzc3VtZSB0aGUgc3VydmV5IHJlc3BvbnNlcyBjb21lIGZyb20gYSBsb2dpc3RpYyByZWdyZXNzaW9uIHdpdGgKdGhlc2UgY29lZmZpY2llbnRzCgpgYGB7ciB9CmNvZWZfaW50ZXJjZXB0IDwtIDAuNgpjb2VmX3NleCA8LSBjKDAsIC0wLjIpCmNvZWZfYWdlIDwtIGMoMCwgLTAuMiwgLTAuMywgLTAuNCkKY29lZl9ldGggPC0gYygwLCAwLjYsIDAuMywgMC4zKQpgYGAKClRoZSBwcm9iYWJpbGl0aWVzIGFyZToKCmBgYHtyIH0KcHJvYl95ZXMgPC0gcmVwKE5BLCBwcm9kKEopKQpmb3IgKGogaW4gMTpwcm9kKEopKXsKICBwcm9iX3llc1tqXSA8LSBpbnZsb2dpdChjb2VmX2ludGVyY2VwdCArIGNvZWZfc2V4W3Bvc3RzdHJhdFtqLDFdXSArCiAgY29lZl9hZ2VbcG9zdHN0cmF0W2osMl1dICsgY29lZl9ldGhbcG9zdHN0cmF0W2osM11dKQp9CmBgYAoKU2ltdWxhdGUgdGhlIGZha2UgZGF0YToKCmBgYHtyIH0KeSA8LSByYmlub20obiwgMSwgcHJvYl95ZXNbcGVvcGxlXSkKYGBgCgojIyBMaW5lYXIgbW9kZWwKCmBgYHtyIH0Kc2V4IDwtIHBvc3RzdHJhdFtwZW9wbGUsMV0KYWdlIDwtIHBvc3RzdHJhdFtwZW9wbGUsMl0KZXRoIDwtIHBvc3RzdHJhdFtwZW9wbGUsM10KZmFrZSA8LSBkYXRhLmZyYW1lKHksIHNleCwgYWdlLCBldGgpCmZpdCA8LSBzdGFuX2dsbSh5IH4gZmFjdG9yKHNleCkgKyBmYWN0b3IoYWdlKSArIGZhY3RvcihldGgpLAogICAgICAgICAgICAgICAgZmFtaWx5PWJpbm9taWFsKGxpbms9ImxvZ2l0IiksIGRhdGE9ZmFrZSwgcmVmcmVzaD0wKQpwcmludChmaXQpCmBgYAoKIyMjIyBQcmVkaWN0aW9uCgpgYGB7ciB9CnByZWRfc2ltIDwtIHBvc3Rlcmlvcl9lcHJlZChmaXQsIG5ld2RhdGE9YXMuZGF0YS5mcmFtZShwb3N0
c3RyYXQpKQpwcmVkX2VzdCA8LSBjb2xNZWFucyhwcmVkX3NpbSkKcm91bmQoY2JpbmQocG9zdHN0cmF0LCBwcm9iX3llcywgcHJlZF9lc3QpLCAyKQpgYGAKCiMjIyMgUG9zdHN0cmF0aWZpY2F0aW9uCgpgYGB7ciB9CnBvc3RzdHJhdF9lc3QgPC0gc3VtKHBvc3RzdHJhdCROKnByZWRfZXN0KS9zdW0ocG9zdHN0cmF0JE4pCnJvdW5kKHBvc3RzdHJhdF9lc3QsIDIpCmBgYAoKcGx1cyB1bmNlcnRhaW50eQoKYGBge3IgfQpwb3N0c3RyYXRfc2ltIDwtIHByZWRfc2ltICUqJSBwb3N0c3RyYXQkTiAvIHN1bShwb3N0c3RyYXQkTikKcm91bmQoYyhtZWFuKHBvc3RzdHJhdF9zaW0pLCBzZChwb3N0c3RyYXRfc2ltKSksIDMpCmBgYAoK