Predict respondents' yearly earnings using survey data from 1990. See Chapter 15 in Regression and Other Stories.


Load packages

library("rprojroot")
root<-has_file(".ROS-Examples-root")$make_fix_file()
library("rstanarm")
library("ggplot2")
library("bayesplot")
theme_set(bayesplot::theme_default(base_family = "sans"))

Load data

earnings <- read.csv(root("Earnings/data","earnings.csv"))
head(earnings)
  height weight male  earn earnk ethnicity education mother_education
1     74    210    1 50000    50     White        16               16
2     66    125    0 60000    60     White        16               16
3     64    126    0 30000    30     White        16               16
4     65    200    0 25000    25     White        17               17
5     63    110    0 50000    50     Other        16               16
6     68    165    0 62000    62     Black        18               18
  father_education walk exercise smokenow tense angry age
1               16    3        3        2     0     0  45
2               16    6        5        1     0     0  58
3               16    8        1        2     1     1  29
4               NA    8        1        2     0     0  57
5               16    5        6        2     0     0  91
6               18    1        1        2     2     2  54

Compound discrete-continuos model

Logistic regression on non-zero earnings

fit_2a <- stan_glm((earn>0) ~ height + male,
                   family = binomial(link = "logit"),
                   data = earnings, refresh = 0)
sims_2a <- as.matrix(fit_2a)
print(fit_2a, digits=2)
stan_glm
 family:       binomial [logit]
 formula:      (earn > 0) ~ height + male
 observations: 1816
 predictors:   3
------
            Median MAD_SD
(Intercept) -2.95   1.92 
height       0.07   0.03 
male         1.67   0.32 

------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg

Linear regression on log scale

fit_2b <- stan_glm(log(earn) ~ height + male,
                   data = earnings, subset = earn>0,
                   refresh = 0)
sims_2b <- as.matrix(fit_2b)
print(fit_2b, digits=2)
stan_glm
 family:       gaussian [identity]
 formula:      log(earn) ~ height + male
 observations: 1629
 predictors:   3
------
            Median MAD_SD
(Intercept) 7.97   0.51  
height      0.02   0.01  
male        0.37   0.06  

Auxiliary parameter(s):
      Median MAD_SD
sigma 0.87   0.02  

------
* For help interpreting the printed output see ?print.stanreg
* For info on the priors used see ?prior_summary.stanreg

Predictions for a new person

new <- data.frame(height = 68, male = 0, earnings$earn>0)
pred_2a <- posterior_predict(fit_2a, newdata=new)
pred_2b <- posterior_predict(fit_2b, newdata=new)
pred <- ifelse(pred_2a == 1, exp(pred_2b), 0)
LS0tCnRpdGxlOiAiUmVncmVzc2lvbiBhbmQgT3RoZXIgU3RvcmllczogRWFybmluZ3MiCmF1dGhvcjogIkFuZHJldyBHZWxtYW4sIEplbm5pZmVyIEhpbGwsIEFraSBWZWh0YXJpIgpkYXRlOiAiYHIgZm9ybWF0KFN5cy5EYXRlKCkpYCIKb3V0cHV0OgogIGh0bWxfZG9jdW1lbnQ6CiAgICB0aGVtZTogcmVhZGFibGUKICAgIHRvYzogdHJ1ZQogICAgdG9jX2RlcHRoOiAyCiAgICB0b2NfZmxvYXQ6IHRydWUKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKLS0tClByZWRpY3QgcmVzcG9uZGVudHMnIHllYXJseSBlYXJuaW5ncyB1c2luZyBzdXJ2ZXkgZGF0YSBmcm9tCjE5OTAuIFNlZSBDaGFwdGVyIDE1IGluIFJlZ3Jlc3Npb24gYW5kIE90aGVyIFN0b3JpZXMuCgotLS0tLS0tLS0tLS0tCgoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChtZXNzYWdlPUZBTFNFLCBlcnJvcj1GQUxTRSwgd2FybmluZz1GQUxTRSwgY29tbWVudD1OQSkKYGBgCgojIyMjIExvYWQgcGFja2FnZXMKCmBgYHtyIH0KbGlicmFyeSgicnByb2pyb290IikKcm9vdDwtaGFzX2ZpbGUoIi5ST1MtRXhhbXBsZXMtcm9vdCIpJG1ha2VfZml4X2ZpbGUoKQpsaWJyYXJ5KCJyc3RhbmFybSIpCmxpYnJhcnkoImdncGxvdDIiKQpsaWJyYXJ5KCJiYXllc3Bsb3QiKQp0aGVtZV9zZXQoYmF5ZXNwbG90Ojp0aGVtZV9kZWZhdWx0KGJhc2VfZmFtaWx5ID0gInNhbnMiKSkKYGBgCgojIyMjIExvYWQgZGF0YQoKYGBge3IgfQplYXJuaW5ncyA8LSByZWFkLmNzdihyb290KCJFYXJuaW5ncy9kYXRhIiwiZWFybmluZ3MuY3N2IikpCmhlYWQoZWFybmluZ3MpCmBgYAoKIyMgQ29tcG91bmQgZGlzY3JldGUtY29udGludW9zIG1vZGVsCiMjIyMgTG9naXN0aWMgcmVncmVzc2lvbiBvbiBub24temVybyBlYXJuaW5ncwoKYGBge3IgfQpmaXRfMmEgPC0gc3Rhbl9nbG0oKGVhcm4+MCkgfiBoZWlnaHQgKyBtYWxlLAogICAgICAgICAgICAgICAgICAgZmFtaWx5ID0gYmlub21pYWwobGluayA9ICJsb2dpdCIpLAogICAgICAgICAgICAgICAgICAgZGF0YSA9IGVhcm5pbmdzLCByZWZyZXNoID0gMCkKc2ltc18yYSA8LSBhcy5tYXRyaXgoZml0XzJhKQpwcmludChmaXRfMmEsIGRpZ2l0cz0yKQpgYGAKCiMjIyMgTGluZWFyIHJlZ3Jlc3Npb24gb24gbG9nIHNjYWxlCgpgYGB7ciB9CmZpdF8yYiA8LSBzdGFuX2dsbShsb2coZWFybikgfiBoZWlnaHQgKyBtYWxlLAogICAgICAgICAgICAgICAgICAgZGF0YSA9IGVhcm5pbmdzLCBzdWJzZXQgPSBlYXJuPjAsCiAgICAgICAgICAgICAgICAgICByZWZyZXNoID0gMCkKc2ltc18yYiA8LSBhcy5tYXRyaXgoZml0XzJiKQpwcmludChmaXRfMmIsIGRpZ2l0cz0yKQpgYGAKCiMjIyMgUHJlZGljdGlvbnMgZm9yIGEgbmV3IHBlcnNvbgoKYGBge3IgfQpuZXcgPC0gZGF0YS5mcmFtZShoZWlnaHQgPSA2OCwgbWFsZSA9IDAsIGVhcm5pbmdzJGVhcm4+MCkKcHJlZF8yYSA8LSBwb3N0ZXJpb3JfcHJlZGljdChmaXRfMmEsIG5ld2RhdGE9bmV3KQpwcmVkXzJiIDwtIHBvc3Rlcmlvcl9wcmVkaWN0KGZpdF8yYiwgbmV3ZGF0YT1uZXcpCnByZWQgPC0gaWZlbHNlKHByZWRfMmEgPT0gMSwgZXhwKHByZWRfMmIpLCAwKQpgYGAKCg==