1 Part A: Complete-Case Analysis not a Panacea

### complete-case analysis fails easily when data is not MCAR
expit <- function(x) exp(x) / (1 + exp(x))

n <- 10^4
X <- rnorm(n)
Y <- X + rnorm(n)

R <- rbinom(n, 1, prob = expit(X))

Ys <- Y
Ys[R==1] <- NA

df <- data.frame(X=X, Ys = Ys)

mean(df[complete.cases(df), ]$Ys)
## [1] -0.3741019

2 Part B: Deadlock Example & When MICE Fails

We now compare complete-case analysis, MICE, and the recovery result from the lectures. We look at the deadlock m-graph, given by the following SCM:

\[ \begin{align} X &\gets \text{Bern}(0.5) \\ Y &\gets \begin{cases} \text{Bern}(2a) \quad \;\,\text{ if } X =1 \\ \text{Bern}(1-2a) \text{ if } X =0 \end{cases} \\ R_X &\gets \text{Bern}(0.1 + \beta Y) \\ R_Y &\gets \text{Bern}(0.1 + \alpha X) \end{align} \]

set.seed(2026)
a <- 0.4
alf <- 0.5
bet <- 0.5

rho <- 4 * a - 1

# generate the data
n <- 10^4
nrep <- 100
res <- NULL
for (i in seq_len(nrep)) {
  
  # sample X, Y
  X <- rbinom(n, 1, prob = 0.5)
  Y <- rbinom(n, 1, prob = ifelse(X, 2 * a, 1 - 2 * a))
  
  # sample the missingness pattern
  Rx <- rbinom(n, 1, 0.1 + alf * Y)
  Ry <- rbinom(n, 1, 0.1 + bet * X)
  
  # generate the available data P(V*, R)
  Xs <- X
  Ys <- Y
  Xs[as.logical(Rx)] <- NA
  Ys[as.logical(Ry)] <- NA
  data <- data.frame(Xs, Ys)
  
  # apply MICE with 20 imputations
  mi_dat <- mice(data = data, m = 20, print=FALSE)
  mirh <- c()
  for (mi in seq_len(mi_dat$m)) {
    
    mirh <- c(mirh, cor(complete(mi_dat, action = mi))[2])
  }
  
  # compute the R-factors for the recovery result
  rx_fact <- mean((Rx == 0)[Ry == 0 & Ys == 1])
  ry_fact <- mean((Ry == 0)[Rx == 0 & Xs == 1])
  
  # estimate P(X = 1, Y = 1) based on the recovery result
  ahat <- mean(rowMeans(data[complete.cases(data), ]) == 1) *
    mean(Rx == 0 & Ry == 0) / (rx_fact * ry_fact)
  
  # get correlation estimates \rho for the three methods
  rho_mg <- 4 * ahat - 1
  rho_mice <- mean(mirh)
  rho_cca <- cor(data[complete.cases(data), ])[2]
  res <- rbind(
    res, 
    data.frame(mg_bias = rho_mg - rho, mice_bias = rho_mice - rho, 
               cca_bias = rho_cca - rho)
  )
}

res <- as.data.table(res)
res_long <- melt(res, measure.vars = 1:3)

# plot the bias distribution over repetitions
ggplot(res_long, aes(x=value, fill = variable)) +
  geom_density(alpha = 0.4) + geom_vline(xintercept = 0, color = "red",
                                         linetype = "dashed") +
  theme_bw() + scale_fill_discrete(name = "Method", 
                                   labels = c("m-graph", "MICE", "CCA")) +
  xlab("Bias") + ylab("Distribution Density")

# compute the z-values for the null hypothesis H_0: no bias
res_long[, list(mu = mean(value), sd = sd(value)), by = "variable"][,
  list(z_value = mu / sd, method = variable)
]

3 Part C: VAHCS Study

In the VAHCS study, a cohort of adolescents was tracked over time, in order to investigate the impact of marijuana usage on mental health.

load(file.path(root, "data/vahcs/data/vahcs_data.rda"))

# VAHCS study data
head(dat)
# helper function for estimate P(R_V = 0 | ...)
r_prob <- function(form, dat, targ_dat, method = "glm") {
  
  fit <- glm(form, family = "binomial", data = dat)
  prob <- predict(fit, newdata = targ_dat, type = "response")
  
  prob
}

set.seed(2026)
nboot <- 250 # 250 bootstrap repetitions
res <- c()
for (b in seq_len(nboot)) {
  
  boot_idx <- if (b == 1) seq_len(nrow(dat)) else sample.int(nrow(dat), replace = TRUE)
  bdat <- dat[boot_idx]
  
  # P(R_X = 0 | Z_1, Z_2, R_{Z_2} = 0)
  prx <- r_prob(!is.na(thc) ~ pari + pard + asb + cmd + alc, 
                bdat[!is.na(cmd) & !is.na(alc)], bdat[complete.cases(bdat)])
  
  # P(R_{Z_2} = 0 | Z_1, X, R_X = 0)
  prz2 <- r_prob(!is.na(cmd) & !is.na(alc) ~ pari + pard + asb + thc, 
                 bdat[!is.na(thc)], bdat[complete.cases(bdat)])
  
  # P(R_Y = 0 | Z_1, Z_2, X, R_X = 0, R_{Z_2} = 0)
  pry <- r_prob(!is.na(mhs) ~ pari + pard + asb + cmd + alc + thc, 
                bdat[!is.na(thc) & !is.na(cmd) & !is.na(alc)],
                bdat[complete.cases(bdat)])
  
  r_factor <- 1 / (prx * prz2 * pry)
  
  # causal effect estimation
  fit_rw <- glm(thc ~ . - mhs, data = bdat[complete.cases(bdat)], 
                family = "quasibinomial",
                weights = r_factor)
  
  fit_cca <- glm(thc ~ . - mhs, data = bdat[complete.cases(bdat)], 
                 family = "binomial")
  
  # extract X, Y values
  x <- bdat[complete.cases(bdat)]$thc == 1
  y <- bdat[complete.cases(bdat)]$mhs
  
  # effect estimate via R-factors
  px_rw <- fit_rw$fitted.values
  te_rw <- 1 / sum(1 / px_rw) * sum( x * y / px_rw ) - 
           1 / sum(1 / (1 - px_rw)) * sum( (1-x) * y / (1-px_rw))
  
  # effect estimate via CCA
  px_cca <- fit_cca$fitted.values
  te_cca <- 1 / sum(1 / px_cca) * sum( x * y / px_cca ) - 
            1 / sum(1 / (1 - px_cca)) * sum( (1-x) * y / (1-px_cca))
  
  res <- rbind(
    res, 
    data.table(boot = b, method = "r_fact", te = te_rw),
    data.table(boot = b, method = "cca", te = te_cca)
  )
}
# aggregate the bootstrap spread - question: what does this line do?
spread <- res[, list(te = 2 * te[boot == 1] - te[boot != 1], 
                     boot = boot[boot != 1]), by = c("method")]

# obtain the lower & upper confidence intervals
spread[, list(lower = quantile(te, prob = 0.025),
              upper = quantile(te, prob = 0.975)), by = "method"]
# visualize the spread of estimated values
ggplot(spread, aes(x = te, fill = method)) + geom_density(alpha = 0.4) + 
  theme_bw() + xlab("Total Effect Estimate") +
  ylab("Bootstrap Distribution Density") +
  scale_fill_discrete(name = "Method", labels = c("Complete-Case", "m-graph")) +
  theme(legend.position = "inside", legend.position.inside = c(0.8, 0.8),
        legend.box.background = element_rect())

3.1 Discussion of Assumptions

Zhang et. al. discuss the assumptions, about all arrows from the observables \(V\) to the missingess variables \(R\).

\(V_i\)\(V_j\) (\(R_{C3}\)) (\(R_{C4}\)) (\(R_X\)) (\(R_Y\))
\((C_3, C_4)\) Likely — failure to return form can be due to mental health issues Likely — failure to return form can be due to mental health issues Likely — failure to return form can be due to mental health issues or alcohol use Likely — failure to return form / wave-7 attrition can be due to mental health issues
\((X)\) Likely — failure to return form can be due to cannabis use Likely — failure to return form can be due to cannabis use Likely — failure to answer cannabis use due to illegality/stigma; failure to return form can be due to cannabis use Likely — failure to return form / wave-7 attrition can be due to cannabis use
\((Y)\) Not likely — missingness in confounders preceded outcome by ≥6 months Not likely — missingness in confounders preceded outcome by ≥6 months Not likely — missingness in exposure preceded outcome by ≥6 months Likely — failure to return form can be due to mental health issues

Questions:

  • What is the conclusion, is our analysis valid?
  • Can the effect be identified under the likely missingness pattern?
  • How could the data be analyzed further?