Demonstrate how the computation time scales with bigger data. See Chapter 22 in Regression and Other Stories.


Load packages

library(arm)         # for display()
library(rstanarm)
options(mc.cores = parallel::detectCores())  # run MCMC chains in parallel
library(tictoc)      # simple tic()/toc() timers

Linear regression n=100K, p=100

Create fake data with n=100,000 and p=100

SEED <- 1656
set.seed(SEED)
n <- 1e5                                  # number of observations
k <- 100                                  # total number of predictors
x <- rnorm(n)                             # the predictor of interest
xn <- matrix(rnorm(n * (k - 1)), nrow=n)  # k-1 irrelevant noise predictors
a <- 2                                    # true intercept
b <- 3                                    # true slope for x
sigma <- 10                               # residual standard deviation
y <- a + b*x + sigma*rnorm(n)
fake <- data.frame(x, xn, y)
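
A quick check of the simulated data size (not part of the original output):

dim(fake)   # 100000 rows, 101 columns: x, X1..X99, y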

Fit using lm

tic()
fit1 <- lm(y ~ ., data=fake)
toc()
1.722 sec elapsed
#display(fit1)
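
As a quick sanity check (not part of the original output), the estimated intercept and slope should be close to the true values a = 2 and b = 3:

round(coef(fit1)[c("(Intercept)", "x")], 2)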

Fit using stan_glm and MCMC

stan_glm is fast for linear regression when n > k and k is small or moderate, as it can use an OLS-style sufficient-statistics trick.
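
The reason this helps: for the linear model the Gaussian likelihood depends on the data only through X'X, X'y, and y'y, which can be computed in one O(nk^2) pass; after that, the remaining cost no longer grows with n. A minimal sketch of the idea in plain R (illustrative only, not the rstanarm internals):

X <- model.matrix(y ~ ., data=fake)  # n x (k+1) design matrix
XtX <- crossprod(X)                  # X'X, computed in a single pass over the data
Xty <- crossprod(X, y)               # X'y
beta_hat <- solve(XtX, Xty)          # OLS solution from the summary statistics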

tic()
fit2 <- stan_glm(y ~ ., data=fake, mean_PPD=FALSE,  # skip the mean-PPD check to save a little time
                 refresh=500, seed=SEED, cores=1)
toc()
15.704 sec elapsed
#print(fit2, digits=2)
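
The posterior medians should essentially agree with the lm estimates; a minimal comparison (not part of the original output):

round(cbind(lm = coef(fit1)[1:2], mcmc = coef(fit2)[1:2]), 2)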

Fit using stan_glm and optimization

Using optimization with a normal approximation and Pareto-smoothed importance resampling provides coefficient standard errors, and also a diagnostic for whether the normal approximation at the mode is appropriate.

tic()
fit3 <- stan_glm(y ~ ., data=fake, algorithm='optimizing', init=0)  # normal approximation at the mode, then importance resampling
toc()
10.287 sec elapsed
#print(fit3, digits=2)
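
The coefficient standard errors come from the importance-resampled draws; a minimal sketch of reading them off with mad() (not part of the original output):

draws <- as.matrix(fit3)   # importance-resampled approximate posterior draws
round(apply(draws[, c("(Intercept)", "x")], 2, mad), 2)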

Logistic regression n=10K, p=100

Create fake data with 10,000 observations and p=100

SEED <- 1655
set.seed(SEED)
n <- 1e4                                  # number of observations
k <- 100                                  # total number of predictors
x <- rnorm(n)                             # the predictor of interest
xn <- matrix(rnorm(n * (k - 1)), nrow=n)  # k-1 irrelevant noise predictors
a <- 2                                    # true intercept on the latent scale
b <- 3                                    # true slope on the latent scale
sigma <- 1                                # latent noise standard deviation
y <- as.numeric(a + b*x + sigma*rnorm(n) > 0)  # binary outcome via a latent threshold
fake <- data.frame(x, xn, y)

Fit using glm

tic()
fit1 <- glm(y ~ ., data=fake, family=binomial())
toc()
1.118 sec elapsed
#display(fit1)

Fit using stan_glm and MCMC

tic()
fit2 <- stan_glm(y ~ ., data=fake, family=binomial(), mean_PPD=FALSE,
                 init_r=0.1, seed=SEED)  # init_r=0.1 narrows the range of random initial values
toc()
117.87 sec elapsed
#print(fit2, digits=2)
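
Again the posterior medians should be close to the glm estimates; a minimal comparison (not part of the original output):

round(cbind(glm = coef(fit1)[1:2], mcmc = coef(fit2)[1:2]), 2)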

Fit using stan_glm and optimization

Using optimization with a normal approximation and Pareto-smoothed importance resampling provides coefficient standard errors, and also a diagnostic for whether the normal approximation at the mode is appropriate.

tic()
fit3 <- stan_glm(y ~ ., data=fake, family=binomial(),
                 algorithm='optimizing', init=0,
                 draws = 16000, keep_every = 4)  # larger approximate-posterior sample, thinned for the resampling
toc()
22.869 sec elapsed
#print(fit3, digits=2)
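
Optimization with importance resampling and MCMC should give similar uncertainty estimates here; a minimal check comparing MAD-based standard errors from the two fits (not part of the original output):

round(cbind(mcmc = apply(as.matrix(fit2)[, 1:2], 2, mad),
            opt  = apply(as.matrix(fit3)[, 1:2], 2, mad)), 2)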

Logistic regression n=100K, p=100

Create fake data with 100,000 observations and p=100

SEED <- 1655
set.seed(SEED)
n <- 1e5                                  # number of observations
k <- 100                                  # total number of predictors
x <- rnorm(n)                             # the predictor of interest
xn <- matrix(rnorm(n * (k - 1)), nrow=n)  # k-1 irrelevant noise predictors
a <- 2                                    # true intercept on the latent scale
b <- 3                                    # true slope on the latent scale
sigma <- 1                                # latent noise standard deviation
y <- as.numeric(a + b*x + sigma*rnorm(n) > 0)  # binary outcome via a latent threshold
fake <- data.frame(x, xn, y)

Fit using glm

tic()
fit1 <- glm(y ~ ., data=fake, family=binomial())
toc()
12.729 sec elapsed
#display(fit1)

Fit using stan_glm and optimization

With n=100,000 the MCMC fit would take considerably longer, so here we use only optimization. Using optimization with a normal approximation and Pareto-smoothed importance resampling provides coefficient standard errors, and also a diagnostic for whether the normal approximation at the mode is appropriate.

tic()
fit3 <- stan_glm(y ~ ., data=fake, family=binomial(),
                 algorithm='optimizing', init=0) 
toc()
21.868 sec elapsed
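
The full summary below prints all 101 coefficients; to display only the parameters of interest, summary() accepts a pars argument:

#summary(fit3, pars=c("(Intercept)", "x"), digits=2)
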
summary(fit3, digits=2)

Model Info:
 function:     stan_glm
 family:       binomial [logit]
 formula:      y ~ .
 algorithm:    optimizing
 priors:       see help('prior_summary')
 observations: 100000
 predictors:   101

Estimates:
              Median   MAD_SD   10%   50%   90%
(Intercept)  3.61     0.03     3.58  3.61  3.65
x            5.45     0.05     5.39  5.45  5.50
X1           0.00     0.01    -0.02  0.00  0.01
X2           0.04     0.01     0.03  0.04  0.06
X3           0.00     0.01    -0.02  0.00  0.01
X4           0.00     0.01    -0.02  0.00  0.01
X5           0.02     0.01     0.00  0.02  0.03
X6           0.01     0.01     0.00  0.01  0.03
X7          -0.01     0.01    -0.03 -0.01  0.01
X8           0.01     0.01    -0.01  0.01  0.02
X9           0.00     0.01    -0.01  0.00  0.02
X10          0.00     0.01    -0.02  0.00  0.02
X11          0.01     0.01    -0.01  0.01  0.02
X12          0.00     0.01    -0.02  0.00  0.01
X13         -0.01     0.01    -0.03 -0.01  0.01
X14         -0.01     0.01    -0.02 -0.01  0.01
X15         -0.02     0.01    -0.04 -0.02  0.00
X16          0.01     0.01    -0.01  0.01  0.03
X17          0.02     0.01     0.00  0.02  0.04
X18         -0.01     0.01    -0.02 -0.01  0.01
X19          0.02     0.01     0.00  0.02  0.03
X20         -0.01     0.01    -0.02 -0.01  0.01
X21         -0.03     0.01    -0.05 -0.03 -0.01
X22         -0.03     0.01    -0.05 -0.03 -0.01
X23          0.01     0.01    -0.01  0.01  0.03
X24          0.00     0.01    -0.02  0.00  0.01
X25          0.01     0.01    -0.01  0.01  0.02
X26          0.00     0.01    -0.02  0.00  0.01
X27         -0.01     0.01    -0.02 -0.01  0.01
X28          0.01     0.01    -0.01  0.01  0.02
X29         -0.01     0.01    -0.02 -0.01  0.01
X30         -0.01     0.01    -0.03 -0.01  0.01
X31          0.00     0.01    -0.02  0.00  0.02
X32         -0.02     0.01    -0.03 -0.02  0.00
X33          0.00     0.01    -0.02  0.00  0.01
X34          0.01     0.01    -0.01  0.01  0.03
X35          0.00     0.01    -0.02  0.00  0.02
X36         -0.01     0.02    -0.03 -0.01  0.00
X37          0.01     0.01    -0.01  0.01  0.02
X38          0.00     0.01    -0.02  0.00  0.02
X39          0.01     0.01     0.00  0.01  0.03
X40          0.00     0.01    -0.01  0.00  0.02
X41         -0.03     0.01    -0.05 -0.03 -0.02
X42         -0.02     0.01    -0.03 -0.02  0.00
X43          0.00     0.01    -0.02  0.00  0.01
X44          0.00     0.01    -0.01  0.00  0.02
X45          0.00     0.01    -0.02  0.00  0.02
X46         -0.02     0.01    -0.03 -0.02  0.00
X47          0.00     0.01    -0.02  0.00  0.01
X48         -0.01     0.01    -0.02 -0.01  0.01
X49          0.01     0.01    -0.01  0.01  0.02
X50          0.00     0.01    -0.02  0.00  0.02
X51          0.00     0.01    -0.02  0.00  0.01
X52          0.01     0.01    -0.01  0.01  0.02
X53          0.00     0.01    -0.02  0.00  0.02
X54          0.00     0.01    -0.01  0.00  0.02
X55          0.01     0.01    -0.01  0.01  0.03
X56         -0.01     0.01    -0.03 -0.01  0.00
X57          0.00     0.01    -0.02  0.00  0.01
X58          0.01     0.01    -0.01  0.01  0.02
X59         -0.02     0.01    -0.04 -0.02 -0.01
X60         -0.01     0.01    -0.02 -0.01  0.01
X61         -0.02     0.01    -0.03 -0.02  0.00
X62         -0.01     0.01    -0.03 -0.01  0.00
X63          0.01     0.01    -0.01  0.01  0.02
X64          0.01     0.01    -0.01  0.01  0.03
X65          0.01     0.01    -0.01  0.01  0.03
X66          0.03     0.01     0.01  0.03  0.04
X67         -0.02     0.01    -0.04 -0.02  0.00
X68          0.01     0.01     0.00  0.01  0.03
X69         -0.01     0.01    -0.03 -0.01  0.01
X70          0.00     0.01    -0.02  0.00  0.01
X71          0.02     0.01     0.01  0.02  0.04
X72          0.00     0.01    -0.02  0.00  0.02
X73         -0.01     0.01    -0.03 -0.01  0.01
X74         -0.01     0.01    -0.03 -0.01  0.01
X75         -0.02     0.01    -0.03 -0.02  0.00
X76          0.01     0.01    -0.01  0.01  0.02
X77          0.00     0.01    -0.02  0.00  0.01
X78         -0.01     0.01    -0.03 -0.01  0.00
X79          0.01     0.01    -0.01  0.01