Regression-based imputation for the Social Indicators Survey. See Chapter 17 in Regression and Other Stories.
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
SIS <- read.csv(root("Imputation/data","SIS.csv"))
head(SIS)
earnings retirement interest assistance other male over65 white immig educ_r
1 84.0 0.7 0.20 0 0.0 1 0 1 1 4
2 NA 0.0 0.00 0 0.0 0 0 0 1 4
3 27.5 0.0 0.16 0 0.0 1 0 0 1 2
4 85.0 12.0 5.00 0 NA 1 1 1 0 4
5 135.0 0.0 0.10 0 0.1 1 0 0 0 4
6 0.0 NA NA NA NA 0 0 0 0 4
workmos workhrs_top any_ssi any_welfare any_charity
1 12 40 0 0 0
2 12 40 0 0 0
3 10 40 0 0 0
4 11 8 0 0 0
5 12 40 0 0 0
6 0 40 0 0 0
summary(SIS)
earnings retirement interest assistance
Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.0000
1st Qu.: 4.80 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.0000
Median : 28.00 Median : 0.00 Median : 0.000 Median : 0.0000
Mean : 52.19 Mean : 2.19 Mean : 1.722 Mean : 0.6821
3rd Qu.: 65.00 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.0000
Max. :3250.00 Max. :300.00 Max. :650.000 Max. :36.4000
NA's :241 NA's :123 NA's :277 NA's :128
other male over65 white
Min. : 0.000 Min. :0.0000 Min. :0.00000 Min. :0.0000
1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
Median : 0.000 Median :0.0000 Median :0.00000 Median :0.0000
Mean : 0.577 Mean :0.3618 Mean :0.07728 Mean :0.3185
3rd Qu.: 0.025 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
Max. :96.000 Max. :1.0000 Max. :1.00000 Max. :1.0000
NA's :133
immig educ_r workmos workhrs_top
Min. :0.0000 Min. :1.000 Min. : 0.000 Min. : 0.00
1st Qu.:0.0000 1st Qu.:2.000 1st Qu.: 7.000 1st Qu.:26.00
Median :0.0000 Median :3.000 Median :12.000 Median :40.00
Mean :0.3924 Mean :2.747 Mean : 8.985 Mean :30.19
3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:12.000 3rd Qu.:40.00
Max. :1.0000 Max. :4.000 Max. :12.000 Max. :40.00
any_ssi any_welfare any_charity
Min. :0.00000 Min. :0.00000 Min. :0.000000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000
Median :0.00000 Median :0.00000 Median :0.000000
Mean :0.03664 Mean :0.03931 Mean :0.009993
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000
Max. :1.00000 Max. :1.00000 Max. :1.000000
Create a completed data vector using imputations
impute <- function(a, a_impute) {
ifelse(is.na(a), a_impute, a)
}
Top code function
topcode <- function(a, top) {
ifelse(a > top, top, a)
}
SIS$earnings_top <- topcode(SIS$earnings, 100)
SIS$earnings_top[SIS$workhrs_top==0 & SIS$workmos==0] <- 0
n <- nrow(SIS)
SIS_predictors <- SIS[,c("male","over65","white","immig","educ_r","workmos",
"workhrs_top","any_ssi","any_welfare","any_charity")]
fit_imp_1 <- stan_glm(
earnings ~ male + over65 + white + immig + educ_r +
workmos + workhrs_top + any_ssi +
any_welfare + any_charity,
data = SIS,
subset = earnings > 0,
refresh = 0
)
print(fit_imp_1)
stan_glm
family: gaussian [identity]
formula: earnings ~ male + over65 + white + immig + educ_r + workmos +
workhrs_top + any_ssi + any_welfare + any_charity
observations: 988
predictors: 11
------
Median MAD_SD
(Intercept) -86.3 29.9
male -0.3 9.1
over65 -40.2 38.1
white 25.9 10.2
immig -12.1 9.4
educ_r 22.4 4.5
workmos 5.3 2.0
workhrs_top 0.8 0.6
any_ssi -34.9 37.1
any_welfare -13.0 24.2
any_charity -29.2 40.2
Auxiliary parameter(s):
Median MAD_SD
sigma 132.3 3.0
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
point predictions
pred_1 <- colMeans(posterior_linpred(fit_imp_1, newdata = SIS_predictors))
SIS$earnings_imp_1 <- impute(SIS$earnings, pred_1)
fit_imp_2 <- stan_glm(
sqrt(earnings_top) ~ male + over65 + white + immig +
educ_r + workmos + workhrs_top + any_ssi +
any_welfare + any_charity,
data = SIS,
subset = earnings > 0,
refresh = 0
)
print(fit_imp_2)
stan_glm
family: gaussian [identity]
formula: sqrt(earnings_top) ~ male + over65 + white + immig + educ_r +
workmos + workhrs_top + any_ssi + any_welfare + any_charity
observations: 988
predictors: 11
------
Median MAD_SD
(Intercept) -1.7 0.4
male 0.3 0.1
over65 -1.4 0.6
white 1.0 0.2
immig -0.6 0.1
educ_r 0.8 0.1
workmos 0.3 0.0
workhrs_top 0.1 0.0
any_ssi -1.0 0.6
any_welfare -1.3 0.4
any_charity -1.2 0.6
Auxiliary parameter(s):
Median MAD_SD
sigma 2.0 0.0
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
point predictions
pred_2_sqrt <- colMeans(posterior_linpred(fit_imp_2, newdata = SIS_predictors))
pred_2 <- topcode(pred_2_sqrt^2, 100)
SIS$earnings_imp_2 <- impute(SIS$earnings_top, pred_2)
pred_3 <- posterior_predict(fit_imp_1, newdata = SIS_predictors, draws = 1)
SIS$earnings_imp_3 <- impute(SIS$earnings, pred_3)
pred_4_sqrt <- posterior_predict(fit_imp_2, newdata = SIS_predictors, draws = 1)
pred_4 <- topcode(pred_4_sqrt^2, 100)
SIS$earnings_imp_4 <- impute(SIS$earnings_top, pred_4)
par(mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
hist(SIS$earnings_top[SIS$earnings>0], breaks=seq(0,100,10), xlab="earnings", ylab="", main="Observed earnings (excluding 0's)")
par(mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
hist(SIS$earnings_imp_2[is.na(SIS$earnings)], breaks=seq(0,100,10),
xlab="earnings", ylab="", ylim=c(0,48),
main="Deterministic imputation of earnings")
par(mar=c(3,3,1,1), mgp=c(1.7,.5,0), tck=-.01)
hist(SIS$earnings_imp_4[is.na(SIS$earnings)], breaks=seq(0,100,10),
xlab="earnings", ylab="", ylim=c(0,48),
main="Random imputation of earnings")
par(mar=c(3,3,2,1), mgp=c(1.7,.5,0), tck=-.01)
plot(range(SIS$earnings_imp_2[is.na(SIS$earnings)]), c(0,100),
xlab="Regression prediction", ylab="Earnings",
main="Deterministic imputation", type="n", bty="l")
points(SIS$earnings_imp_2[is.na(SIS$earnings)], SIS$earnings_imp_2[is.na(SIS$earnings)], pch=19, cex=.5)
points(pred_2, SIS$earnings_top, pch=20, col="darkgray", cex=.5)
par(mar=c(3,3,2,1), mgp=c(1.7,.5,0), tck=-.01)
plot(range(SIS$earnings_imp_2[is.na(SIS$earnings)]), c(0,100),
xlab="Regression prediction", ylab="Earnings",
main="Random imputation", type="n", bty="l")
points(SIS$earnings_imp_2[is.na(SIS$earnings)], SIS$earnings_imp_4[is.na(SIS$earnings)], pch=19, cex=.5)
points(pred_2, SIS$earnings_top, pch=20, col="darkgray", cex=.5)
fit_positive <- stan_glm((earnings>0) ~ male + over65 + white + immig +
educ_r + any_ssi + any_welfare + any_charity,
data=SIS, family=binomial(link=logit), refresh = 0)
print(fit_positive)
stan_glm
family: binomial [logit]
formula: (earnings > 0) ~ male + over65 + white + immig + educ_r + any_ssi +
any_welfare + any_charity
observations: 1260
predictors: 9
------
Median MAD_SD
(Intercept) 0.4 0.3
male 0.3 0.2
over65 -4.0 0.3
white -0.1 0.2
immig 0.2 0.2
educ_r 0.6 0.1
any_ssi -2.5 0.4
any_welfare -0.6 0.3
any_charity 0.5 0.8
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
fit_positive_sqrt <- stan_glm(sqrt(earnings_top) ~ male + over65 + white + immig +
educ_r + any_ssi + any_welfare + any_charity,
data=SIS, subset=earnings>0, refresh = 0) # (same as fit_imp_2 from above)
print(fit_positive_sqrt)
stan_glm
family: gaussian [identity]
formula: sqrt(earnings_top) ~ male + over65 + white + immig + educ_r +
any_ssi + any_welfare + any_charity
observations: 988
predictors: 9
------
Median MAD_SD
(Intercept) 3.7 0.2
male 0.4 0.1
over65 -2.1 0.6
white 1.0 0.2
immig -0.6 0.1
educ_r 0.9 0.1
any_ssi -1.2 0.6
any_welfare -2.8 0.4
any_charity -1.7 0.7
Auxiliary parameter(s):
Median MAD_SD
sigma 2.2 0.0
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# one random imp
pred_sign <- posterior_predict(fit_positive, newdata = SIS_predictors, draws = 1)
# one random imp
pred_pos_sqrt <- posterior_predict(fit_positive_sqrt, newdata = SIS_predictors,
draws = 1)
pred_pos <- topcode(pred_pos_sqrt^2, 100)
SIS$earnings_imp <- impute(SIS$earnings, pred_sign*pred_pos)
random_imp <- function (a){
missing <- is.na(a)
n_missing <- sum(missing)
a_obs <- a[!missing]
imputed <- a
imputed[missing] <- sample(a_obs, n_missing)
imputed
}
SIS$interest_imp <- random_imp(SIS$interest)
SIS$earnings_imp <- random_imp(SIS$earnings)
n_loop <- 10
for (s in 1:n_loop){
fit <- stan_glm(earnings ~ interest_imp + male + over65 + white +
immig + educ_r + workmos + workhrs_top + any_ssi + any_welfare +
any_charity, data=SIS, refresh = 0)
SIS_predictors <- SIS[,c("male","over65","white","immig","educ_r","workmos",
"workhrs_top","any_ssi","any_welfare","any_charity",
"interest_imp", "earnings_imp")]
pred1 <- posterior_predict(fit, newdata = SIS_predictors, draws = 1)
SIS$earnings_imp <- impute(SIS$earnings, pred1)
fit <- stan_glm(interest ~ earnings_imp + male + over65 + white +
immig + educ_r + workmos + workhrs_top + any_ssi + any_welfare +
any_charity, data=SIS, refresh = 0)
SIS_predictors <- SIS[,c("male","over65","white","immig","educ_r","workmos",
"workhrs_top","any_ssi","any_welfare","any_charity",
"interest_imp", "earnings_imp")]
pred2 <- posterior_predict(fit, newdata = SIS_predictors, draws = 1)
SIS$interest_imp <- impute(SIS$interest, pred2)
}