Regression-based imputation for the Social Indicators Survey. See Chapter 17 in Regression and Other Stories.
dplyr + ggplot version.
Load packages
library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
library("dplyr")
library("ggplot2")
theme_set(bayesplot::theme_default(base_family = "sans"))
Load data
SIS <- read.csv(root("Imputation/data","SIS.csv"))
head(SIS)
earnings retirement interest assistance other male over65 white immig educ_r
1 84.0 0.7 0.20 0 0.0 1 0 1 1 4
2 NA 0.0 0.00 0 0.0 0 0 0 1 4
3 27.5 0.0 0.16 0 0.0 1 0 0 1 2
4 85.0 12.0 5.00 0 NA 1 1 1 0 4
5 135.0 0.0 0.10 0 0.1 1 0 0 0 4
6 0.0 NA NA NA NA 0 0 0 0 4
workmos workhrs_top any_ssi any_welfare any_charity
1 12 40 0 0 0
2 12 40 0 0 0
3 10 40 0 0 0
4 11 8 0 0 0
5 12 40 0 0 0
6 0 40 0 0 0
summary(SIS)
earnings retirement interest assistance
Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.0000
1st Qu.: 4.80 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.0000
Median : 28.00 Median : 0.00 Median : 0.000 Median : 0.0000
Mean : 52.19 Mean : 2.19 Mean : 1.722 Mean : 0.6821
3rd Qu.: 65.00 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.0000
Max. :3250.00 Max. :300.00 Max. :650.000 Max. :36.4000
NA's :241 NA's :123 NA's :277 NA's :128
other male over65 white
Min. : 0.000 Min. :0.0000 Min. :0.00000 Min. :0.0000
1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
Median : 0.000 Median :0.0000 Median :0.00000 Median :0.0000
Mean : 0.577 Mean :0.3618 Mean :0.07728 Mean :0.3185
3rd Qu.: 0.025 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
Max. :96.000 Max. :1.0000 Max. :1.00000 Max. :1.0000
NA's :133
immig educ_r workmos workhrs_top
Min. :0.0000 Min. :1.000 Min. : 0.000 Min. : 0.00
1st Qu.:0.0000 1st Qu.:2.000 1st Qu.: 7.000 1st Qu.:26.00
Median :0.0000 Median :3.000 Median :12.000 Median :40.00
Mean :0.3924 Mean :2.747 Mean : 8.985 Mean :30.19
3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:12.000 3rd Qu.:40.00
Max. :1.0000 Max. :4.000 Max. :12.000 Max. :40.00
any_ssi any_welfare any_charity
Min. :0.00000 Min. :0.00000 Min. :0.000000
1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000
Median :0.00000 Median :0.00000 Median :0.000000
Mean :0.03664 Mean :0.03931 Mean :0.009993
3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000
Max. :1.00000 Max. :1.00000 Max. :1.000000
Imputation helper functions Create a completed data vector using imputations
impute <- function(a, a_impute) {
ifelse(is.na(a), a_impute, a)
}
Top code function
topcode <- function(a, top) {
ifelse(a>top, top, a)
}
se missing codes to NA
na_fix <- function(a) {
ifelse(a<0 | a==999999, NA, a)
}
is_any <- function(a) {
any_a <- ifelse(a>0, 1, 0)
any_a[is.na(a)] <- 0
any_a
}
Impute 0 earnings using the logical rule (if worked 0 months and 0 hrs/wk)
SIS <- mutate(SIS,
zero_earnings = (workhrs_top==0 & workmos==0),
earnings_top = if_else(zero_earnings, 0, topcode(earnings, 100)))
Create a little dataset with all our redefined variables
SIS <- select(SIS,
earnings, earnings_top, interest, male, over65,
white, immig, educ_r, workmos, workhrs_top,
any_ssi, any_welfare, any_charity)
Impute subset of earnings that are nonzero: linear scale
fit_imp_1 <- stan_glm(
earnings ~ male + over65 + white + immig + educ_r +
workmos + workhrs_top + any_ssi +
any_welfare + any_charity,
data = SIS,
subset = earnings > 0,
refresh = 0
)
print(fit_imp_1)
stan_glm
family: gaussian [identity]
formula: earnings ~ male + over65 + white + immig + educ_r + workmos +
workhrs_top + any_ssi + any_welfare + any_charity
observations: 988
predictors: 11
------
Median MAD_SD
(Intercept) -87.0 30.0
male -0.5 9.1
over65 -40.3 39.9
white 26.2 9.9
immig -12.3 9.3
educ_r 22.6 4.3
workmos 5.4 2.1
workhrs_top 0.8 0.6
any_ssi -34.3 38.6
any_welfare -11.6 24.5
any_charity -29.8 40.7
Auxiliary parameter(s):
Median MAD_SD
sigma 132.2 3.1
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
SIS_predictors <- select(SIS, -starts_with("earnings"))
pred_1 <- colMeans(posterior_linpred(fit_imp_1, newdata = SIS_predictors))
SIS <- mutate(SIS,
earnings_imp_1 = impute(earnings, pred_1))
Impute subset of earnings that are nonzero: square root scale and topcoding
fit_imp_2 <- stan_glm(
sqrt(earnings_top) ~ male + over65 + white + immig +
educ_r + workmos + workhrs_top + any_ssi +
any_welfare + any_charity,
data = SIS,
subset = earnings > 0,
refresh = 0
)
print(fit_imp_2)
stan_glm
family: gaussian [identity]
formula: sqrt(earnings_top) ~ male + over65 + white + immig + educ_r +
workmos + workhrs_top + any_ssi + any_welfare + any_charity
observations: 988
predictors: 11
------
Median MAD_SD
(Intercept) -1.7 0.4
male 0.3 0.1
over65 -1.4 0.6
white 1.0 0.1
immig -0.6 0.1
educ_r 0.8 0.1
workmos 0.3 0.0
workhrs_top 0.1 0.0
any_ssi -0.9 0.5
any_welfare -1.3 0.4
any_charity -1.2 0.6
Auxiliary parameter(s):
Median MAD_SD
sigma 2.0 0.0
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
point predictions
pred_2_sqrt <- colMeans(posterior_linpred(fit_imp_2, newdata = SIS_predictors))
pred_2 <- topcode(pred_2_sqrt^2, 100)
SIS <- mutate(SIS,
pred_2 = pred_2,
earnings_imp_2 = impute(earnings_top, pred_2))
Linear scale (use fitted model fit_imp_1)
pred_3 <- posterior_predict(fit_imp_1, newdata = SIS_predictors, draws = 1)
SIS <- mutate(SIS,
earnings_imp_3 = impute(earnings, pred_3))
Square root scale and topcoding (use fitted model fit_imp_2)
pred_4_sqrt <- posterior_predict(fit_imp_2, newdata = SIS_predictors, draws = 1)
pred_4 <- topcode(pred_4_sqrt^2, 100)
SIS <- mutate(SIS,
earnings_imp_4 = impute(earnings_top, pred_4))
Observed earnings (excluding 0's)
qplot(earnings_top, data=filter(SIS, !is.na(earnings)), breaks=seq(0,100,10),
fill=I("white"), col=I("black")) +
xlab("earnings") + ggtitle("Observed earnings (excluding 0's)")
Random imputation of earnings
qplot(earnings_imp_4, data = filter(SIS, !is.na(earnings)),
breaks=seq(0,100,10), fill=I("white"), col=I("black")) +
xlab("earnings") + ggtitle("Random imputation of earnings")
Deterministic imputation scatter plot
ggplot() +
geom_point(aes(x=earnings_imp_2, y=earnings_imp_2), shape=19,
data = filter(SIS, !is.na(earnings))) +
geom_point(aes(x=pred_2, y=earnings_top), shape=20, color="darkgrey",
data = filter(SIS, !is.na(earnings))) +
xlab("Regression prediction") + ylab("Earnings") +
ggtitle("Deterministic imputation")
Random imputation scatter plot
ggplot() +
geom_point(aes(x=earnings_imp_2, y=earnings_imp_4), shape=19,
data = filter(SIS, !is.na(earnings))) +
geom_point(aes(x=pred_2, y=earnings_top), shape=20, color="darkgrey",
data = filter(SIS, !is.na(earnings))) +
xlab("Regression prediction") + ylab("Earnings") +
ggtitle("Random imputation")
Fit the 2 models
fit_positive <- stan_glm((earnings>0) ~ male + over65 + white + immig +
educ_r + any_ssi + any_welfare + any_charity,
data=SIS, family=binomial(link=logit), refresh=0)
print(fit_positive, digits=2)
stan_glm
family: binomial [logit]
formula: (earnings > 0) ~ male + over65 + white + immig + educ_r + any_ssi +
any_welfare + any_charity
observations: 1260
predictors: 9
------
Median MAD_SD
(Intercept) 0.35 0.26
male 0.31 0.18
over65 -4.06 0.34
white -0.10 0.21
immig 0.18 0.18
educ_r 0.56 0.09
any_ssi -2.48 0.38
any_welfare -0.57 0.35
any_charity 0.52 0.88
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
# 2nd model was alraedy fitted before
fit_positive_sqrt <- fit_imp_2
print(fit_positive_sqrt)
stan_glm
family: gaussian [identity]
formula: sqrt(earnings_top) ~ male + over65 + white + immig + educ_r +
workmos + workhrs_top + any_ssi + any_welfare + any_charity
observations: 988
predictors: 11
------
Median MAD_SD
(Intercept) -1.7 0.4
male 0.3 0.1
over65 -1.4 0.6
white 1.0 0.1
immig -0.6 0.1
educ_r 0.8 0.1
workmos 0.3 0.0
workhrs_top 0.1 0.0
any_ssi -0.9 0.5
any_welfare -1.3 0.4
any_charity -1.2 0.6
Auxiliary parameter(s):
Median MAD_SD
sigma 2.0 0.0
------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg
Predict the sign and then the earnings (if positive)
pred_pos <- posterior_predict(fit_positive, newdata = SIS_predictors, draws = 1)
pred_ifpos_sqrt <- posterior_predict(fit_positive_sqrt, newdata = SIS_predictors, draws = 1)
pred_ifpos <- topcode(pred_ifpos_sqrt^2, 100)
SIS <- mutate(SIS,
earnings_imp = impute(earnings, pred_pos*pred_ifpos))
Starting values
random_imp <- function (a){
missing <- is.na(a)
n_missing <- sum(missing)
a_obs <- a[!missing]
imputed <- a
imputed[missing] <- sample(a_obs, n_missing)
imputed
}
SIS <- mutate(SIS,
interest_imp = random_imp(interest),
earnings_imp = random_imp(earnings))
Simplest regression imputation
n_sims <- 10
for (s in 1:n_sims) {
# Predict earnings
output <- capture.output(
fit <- stan_glm(earnings ~ interest_imp + male + over65 + white +
immig + educ_r + workmos + workhrs_top + any_ssi +
any_welfare + any_charity,
data = SIS, cores = 1, open_progress = FALSE, refresh=0))
pred1 <- posterior_predict(fit, draws = 1)
# Impute earnings
SIS <- mutate(SIS,
earnings_imp = impute(earnings, pred1))
# Predict interest
output <- capture.output(
fit <- stan_glm(interest ~ earnings_imp + male + over65 + white +
immig + educ_r + workmos + workhrs_top + any_ssi +
any_welfare + any_charity,
data = SIS, cores = 1, open_progress = FALSE, refresh=0))
pred2 <- posterior_predict(fit, draws = 1)
# Impute interest
SIS <- mutate(SIS,
interest_imp = impute(interest, pred2))
}