### complete-case analysis fails easily when data is not MCAR
expit <- function(x) exp(x) / (1 + exp(x))
n <- 10^4
X <- rnorm(n)
Y <- X + rnorm(n)
R <- rbinom(n, 1, prob = expit(X))
Ys <- Y
Ys[R==1] <- NA
df <- data.frame(X=X, Ys = Ys)
mean(df[complete.cases(df), ]$Ys)## [1] -0.3741019
We now compare complete-case analysis, MICE, and the recovery result from the lectures. We look at the deadlock m-graph, given by the following SCM:
\[ \begin{align} X &\gets \text{Bern}(0.5) \\ Y &\gets \begin{cases} \text{Bern}(2a) \quad \;\,\text{ if } X =1 \\ \text{Bern}(1-2a) \text{ if } X =0 \end{cases} \\ R_X &\gets \text{Bern}(0.1 + \beta Y) \\ R_Y &\gets \text{Bern}(0.1 + \alpha X) \end{align} \]
set.seed(2026)
a <- 0.4
alf <- 0.5
bet <- 0.5
rho <- 4 * a - 1
# generate the data
n <- 10^4
nrep <- 100
res <- NULL
for (i in seq_len(nrep)) {
# sample X, Y
X <- rbinom(n, 1, prob = 0.5)
Y <- rbinom(n, 1, prob = ifelse(X, 2 * a, 1 - 2 * a))
# sample the missingness pattern
Rx <- rbinom(n, 1, 0.1 + alf * Y)
Ry <- rbinom(n, 1, 0.1 + bet * X)
# generate the available data P(V*, R)
Xs <- X
Ys <- Y
Xs[as.logical(Rx)] <- NA
Ys[as.logical(Ry)] <- NA
data <- data.frame(Xs, Ys)
# apply MICE with 20 imputations
mi_dat <- mice(data = data, m = 20, print=FALSE)
mirh <- c()
for (mi in seq_len(mi_dat$m)) {
mirh <- c(mirh, cor(complete(mi_dat, action = mi))[2])
}
# compute the R-factors for the recovery result
rx_fact <- mean((Rx == 0)[Ry == 0 & Ys == 1])
ry_fact <- mean((Ry == 0)[Rx == 0 & Xs == 1])
# estimate P(X = 1, Y = 1) based on the recovery result
ahat <- mean(rowMeans(data[complete.cases(data), ]) == 1) *
mean(Rx == 0 & Ry == 0) / (rx_fact * ry_fact)
# get correlation estimates \rho for the three methods
rho_mg <- 4 * ahat - 1
rho_mice <- mean(mirh)
rho_cca <- cor(data[complete.cases(data), ])[2]
res <- rbind(
res,
data.frame(mg_bias = rho_mg - rho, mice_bias = rho_mice - rho,
cca_bias = rho_cca - rho)
)
}
res <- as.data.table(res)
res_long <- melt(res, measure.vars = 1:3)
# plot the bias distribution over repetitions
ggplot(res_long, aes(x=value, fill = variable)) +
geom_density(alpha = 0.4) + geom_vline(xintercept = 0, color = "red",
linetype = "dashed") +
theme_bw() + scale_fill_discrete(name = "Method",
labels = c("m-graph", "MICE", "CCA")) +
xlab("Bias") + ylab("Distribution Density")# compute the z-values for the null hypothesis H_0: no bias
res_long[, list(mu = mean(value), sd = sd(value)), by = "variable"][,
list(z_value = mu / sd, method = variable)
]In the VAHCS study, a cohort of adolescents was tracked over time, in order to investigate the impact of marijuana usage on mental health.
# helper function for estimate P(R_V = 0 | ...)
r_prob <- function(form, dat, targ_dat, method = "glm") {
fit <- glm(form, family = "binomial", data = dat)
prob <- predict(fit, newdata = targ_dat, type = "response")
prob
}
set.seed(2026)
nboot <- 250 # 250 bootstrap repetitions
res <- c()
for (b in seq_len(nboot)) {
boot_idx <- if (b == 1) seq_len(nrow(dat)) else sample.int(nrow(dat), replace = TRUE)
bdat <- dat[boot_idx]
# P(R_X = 0 | Z_1, Z_2, R_{Z_2} = 0)
prx <- r_prob(!is.na(thc) ~ pari + pard + asb + cmd + alc,
bdat[!is.na(cmd) & !is.na(alc)], bdat[complete.cases(bdat)])
# P(R_{Z_2} = 0 | Z_1, X, R_X = 0)
prz2 <- r_prob(!is.na(cmd) & !is.na(alc) ~ pari + pard + asb + thc,
bdat[!is.na(thc)], bdat[complete.cases(bdat)])
# P(R_Y = 0 | Z_1, Z_2, X, R_X = 0, R_{Z_2} = 0)
pry <- r_prob(!is.na(mhs) ~ pari + pard + asb + cmd + alc + thc,
bdat[!is.na(thc) & !is.na(cmd) & !is.na(alc)],
bdat[complete.cases(bdat)])
r_factor <- 1 / (prx * prz2 * pry)
# causal effect estimation
fit_rw <- glm(thc ~ . - mhs, data = bdat[complete.cases(bdat)],
family = "quasibinomial",
weights = r_factor)
fit_cca <- glm(thc ~ . - mhs, data = bdat[complete.cases(bdat)],
family = "binomial")
# extract X, Y values
x <- bdat[complete.cases(bdat)]$thc == 1
y <- bdat[complete.cases(bdat)]$mhs
# effect estimate via R-factors
px_rw <- fit_rw$fitted.values
te_rw <- 1 / sum(1 / px_rw) * sum( x * y / px_rw ) -
1 / sum(1 / (1 - px_rw)) * sum( (1-x) * y / (1-px_rw))
# effect estimate via CCA
px_cca <- fit_cca$fitted.values
te_cca <- 1 / sum(1 / px_cca) * sum( x * y / px_cca ) -
1 / sum(1 / (1 - px_cca)) * sum( (1-x) * y / (1-px_cca))
res <- rbind(
res,
data.table(boot = b, method = "r_fact", te = te_rw),
data.table(boot = b, method = "cca", te = te_cca)
)
}# aggregate the bootstrap spread - question: what does this line do?
spread <- res[, list(te = 2 * te[boot == 1] - te[boot != 1],
boot = boot[boot != 1]), by = c("method")]
# obtain the lower & upper confidence intervals
spread[, list(lower = quantile(te, prob = 0.025),
upper = quantile(te, prob = 0.975)), by = "method"]# visualize the spread of estimated values
ggplot(spread, aes(x = te, fill = method)) + geom_density(alpha = 0.4) +
theme_bw() + xlab("Total Effect Estimate") +
ylab("Bootstrap Distribution Density") +
scale_fill_discrete(name = "Method", labels = c("Complete-Case", "m-graph")) +
theme(legend.position = "inside", legend.position.inside = c(0.8, 0.8),
legend.box.background = element_rect())Zhang et. al. discuss the assumptions, about all arrows from the observables \(V\) to the missingess variables \(R\).
| \(V_i\) → \(V_j\) | (\(R_{C3}\)) | (\(R_{C4}\)) | (\(R_X\)) | (\(R_Y\)) |
|---|---|---|---|---|
| \((C_3, C_4)\) | Likely — failure to return form can be due to mental health issues | Likely — failure to return form can be due to mental health issues | Likely — failure to return form can be due to mental health issues or alcohol use | Likely — failure to return form / wave-7 attrition can be due to mental health issues |
| \((X)\) | Likely — failure to return form can be due to cannabis use | Likely — failure to return form can be due to cannabis use | Likely — failure to answer cannabis use due to illegality/stigma; failure to return form can be due to cannabis use | Likely — failure to return form / wave-7 attrition can be due to cannabis use |
| \((Y)\) | Not likely — missingness in confounders preceded outcome by ≥6 months | Not likely — missingness in confounders preceded outcome by ≥6 months | Not likely — missingness in exposure preceded outcome by ≥6 months | Likely — failure to return form can be due to mental health issues |
Questions: