Sensitivity to Unobserved Confounding

1 Binary Outcome Sensitivity

1.1 Loading Obesity Paradox Data from MIMIC-IV

We begin by loading the data for investigating the obesity paradox. The key data that we need includes body mass index (BMI, treatment $X$), in-hospital mortality (outcome $Y$), and the confounding variables, including age and comorbidities in the Elixhauser score.

src <- "miiv" # MIMIC-IV data
data <- load_concepts(c("death", "bmi_all", "elix_components", "age"), src, 
                      id_type = "patient", verbose = FALSE)

data <- data[is.na(death), death := FALSE]
data[, c(meta_vars(data), "elix_components") := NULL]

# focus on patients with BMI recorded, and ICD code available
data <- data[complete.cases(data)]

head(data)

We remove Obesity/Weight Loss comorbidities, as these are directly related to the treatment itself. We further combine diabetes with and without complications. We focus only on patients who have a BMI in the range $[18.5, 35)\, kg/m^2$, meaning that we exclude underweight ($<18.5$) and class II and III obese $(\geq 35)$ patients.

# remove Obesity and Weight Loss as they are closely related to BMI
data[, c("Obesity", "WeightLoss") := NULL]

# combine diabetes and diabetes with complications
data[, DM := pmax(DM, DMcx)]
data[, DMcx := NULL]

# remove underweight and class II/III obese patients
data <- data[bmi_all >= 18.5 & bmi_all < 35]

1.2 Variable Selection

For performing the analysis, to reduce dimensionality, we focus on comorbidities that are predictive for death.

# regress death onto comorbidities and age in a logistic model 
logreg <- glm(death ~ . - bmi_all + (bmi_all < 25), data = data, 
                family = "binomial")
# get the coefficients
lr_coef <- summary(logreg)$coefficients
# order the variables by coefficient size
lr_coef <- lr_coef[order(lr_coef[, 1], decreasing = TRUE), ]

# focus on comorbidities that (significantly) increase mortality 
sel <- lr_coef[, 1] > 0 & lr_coef[, 4] < 0.05
cmb_sel <- setdiff(rownames(lr_coef)[sel], c("age"))

data[, bmi_ind := (bmi_all < 25)]

For simplicity, we subset the data to include the five most predictive comorbidities.

dat_fin <- data[, c("death", "bmi_ind", "age", cmb_sel[1:5]), with=FALSE] 
head(dat_fin)

1.3 Computing TRR & E-value

We first write some helper functions for fitting an xgboost model.

cv_xgb <- function(df, y, weights = NULL, ...) {

  dtrain <- xgb.DMatrix(data = as.matrix(df), label = y, weight = weights)

  binary <- all(y %in% c(0, 1))
  if (binary) {

    params <- list(objective = "binary:logistic", eval_metric = "logloss")
  } else {

    params <- list(objective = "reg:squarederror", eval_metric = "rmse")
  }
  
  cv <- xgb.cv(
    params = params,
    data = dtrain,
    nrounds = 1000,
    nfold = 5,
    early_stopping_rounds = 10,
    prediction = TRUE,
    verbose = FALSE, ...
  )

  xgb <- xgb.train(
    params = params,
    data = dtrain,
    nrounds = cv$early_stop$best_iteration,
    verbose = FALSE, ...
  )
  attr(xgb, "binary") <- binary

  xgb
}

pred_xgb <- function(xgb, df_test, intervention = NULL, X = "X") {

  if (!is.null(intervention)) {

    df_test[[X]] <- intervention
  }

  predict(xgb, as.matrix(df_test))
}

Then, we perform 5-fold cross-fitting. We leave one fold out, and train xgboost models on the remaining folds. Our goal is to compute the conditional probabilities \[ P(Y = 1 \mid X, Z) \] which allows us to compute the estimated total risk ratio (TRR) through \[ \frac{P(Y = 1 \mid do(X = x_1)) } {P(Y = 1 \mid do(X = x_0)) } = \frac{\sum_{i} P(Y = 1 \mid X=x_1, Z=z_i)}{\sum_i P(Y=1\mid X=x_0, Z=z_i)}.\] Furthermore, we also compute conditional probabilities \[ P(Z_j \mid X, Z_{-j})\] which we use later for reasoning about the strength of a possible unobserved confounder and the associated parameters $RR_{XU}, RR_{UY}$.

z_bin <- cmb_sel[1:5]
x_nm <- "bmi_ind"
y_nm <- "death"

k <- 5L
set.seed(1)
fold_id <- sample.int(k, nrow(dat_fin), replace = TRUE)

# predictors for outcome model (Y | X,Z)
df_y_all <- dat_fin[, -c(y_nm), with = FALSE]
y_all <- as.integer(dat_fin[[y_nm]])

# storage (accumulate held-out sums across folds)
s_y_x1 <- 0; s_y_x0 <- 0; n_tot <- 0L

s_z1_x1 <- setNames(numeric(length(z_bin)), z_bin)  # sum P(Zj=1 | X=1, Z_-j)
s_z1_x0 <- setNames(numeric(length(z_bin)), z_bin)  # sum P(Zj=1 | X=0, Z_-j)
s_z0_x1 <- setNames(numeric(length(z_bin)), z_bin)  # sum P(Zj=0 | X=1, Z_-j)
s_z0_x0 <- setNames(numeric(length(z_bin)), z_bin)  # sum P(Zj=0 | X=0, Z_-j)

s_y_x1_z1 <- setNames(numeric(length(z_bin)), z_bin) # sum E[Y | X=1,Zj=1,Z_-j]
s_y_x1_z0 <- setNames(numeric(length(z_bin)), z_bin) # sum E[Y | X=1,Zj=0,Z_-j]
s_y_x0_z1 <- setNames(numeric(length(z_bin)), z_bin) # sum E[Y | X=0,Zj=1,Z_-j]
s_y_x0_z0 <- setNames(numeric(length(z_bin)), z_bin) # sum E[Y | X=0,Zj=0,Z_-j)

# cross-validated approach
for (fold in seq_len(k)) {

  idx_te <- which(fold_id == fold)
  idx_tr <- which(fold_id != fold)
  n_k <- length(idx_te)
  n_tot <- n_tot + n_k

  # outcome model: fit on train, predict risks on held-out under do(X=1) and do(X=0)
  mod_y <- cv_xgb(df_y_all[idx_tr], y_all[idx_tr])

  df_te <- copy(df_y_all[idx_te])
  p_x1 <- pred_xgb(mod_y, df_te, intervention = 1L, X = x_nm)
  p_x0 <- pred_xgb(mod_y, df_te, intervention = 0L, X = x_nm)

  s_y_x1 <- s_y_x1 + sum(p_x1)  # sum_i \hat P(Y=1 | do(X=1), Z_i) over held-out i
  s_y_x0 <- s_y_x0 + sum(p_x0)  # sum_i \hat P(Y=1 | do(X=0), Z_i) over held-out i

  # per-confounder models / toggles
  for (zj in z_bin) {

    # selection model: Zj | X, Z_-j  (fit on train, predict on held-out)
    df_x_all <- dat_fin[, c(x_nm, setdiff(names(dat_fin), c(y_nm, zj, x_nm))), with = FALSE]
    y_z_all <- as.integer(dat_fin[[zj]])

    mod_z <- cv_xgb(df_x_all[idx_tr], y_z_all[idx_tr])

    df_x_te <- copy(df_x_all[idx_te])
    q1 <- pred_xgb(mod_z, df_x_te, intervention = 1L, X = x_nm) # \hat P(Zj=1 | X=1, Z_-j)
    q0 <- pred_xgb(mod_z, df_x_te, intervention = 0L, X = x_nm) # \hat P(Zj=1 | X=0, Z_-j)

    s_z1_x1[zj] <- s_z1_x1[zj] + sum(q1)
    s_z1_x0[zj] <- s_z1_x0[zj] + sum(q0)
    s_z0_x1[zj] <- s_z0_x1[zj] + sum(1 - q1)  # \hat P(Zj=0 | X=1, Z_-j)
    s_z0_x0[zj] <- s_z0_x0[zj] + sum(1 - q0)  # \hat P(Zj=0 | X=0, Z_-j)

    # outcome strength: Y | X, Z. Using the *same* mod_y, toggle (X, Zj) on held-out.
    df1 <- copy(df_te); df1[[x_nm]] <- 1L; df1[[zj]] <- 1L
    df0 <- copy(df_te); df0[[x_nm]] <- 1L; df0[[zj]] <- 0L
    s_y_x1_z1[zj] <- s_y_x1_z1[zj] + sum(pred_xgb(mod_y, df1))
    s_y_x1_z0[zj] <- s_y_x1_z0[zj] + sum(pred_xgb(mod_y, df0))

    df1 <- copy(df_te); df1[[x_nm]] <- 0L; df1[[zj]] <- 1L
    df0 <- copy(df_te); df0[[x_nm]] <- 0L; df0[[zj]] <- 0L
    s_y_x0_z1[zj] <- s_y_x0_z1[zj] + sum(pred_xgb(mod_y, df1))
    s_y_x0_z0[zj] <- s_y_x0_z0[zj] + sum(pred_xgb(mod_y, df0))
  }
}

We first compute the TRR, and the E-value:

# marginal RR^{obs} from cross-fitted risks
trr_cv <- (s_y_x1 / n_tot) / (s_y_x0 / n_tot)
trr_cv

## [1] 1.084788

e_val_cv <- trr_cv + sqrt(trr_cv * (trr_cv - 1))
e_val_cv

## [1] 1.388064

To explain away the effect, an unobserved confounder $U$ would have to (i) incurr a 1.39-fold increase in the risk ratio for either $U=1$ or $U=0$ from a $x_0 \to x_1$ transition; (ii) be associated with a 1.39-fold increase in the risk-ratio for $Y$ from a $U=0 \to U=1$ or a $U=1 \to U=0$ change.

1.4 Heuristic: Leave-One-Confounder-Out

We next try to get a sense of how large the values $RR_{XU}, RR_{UY}$ may be in practice. For this, we proceed as follows. For each $Z_j \in \{Z_1, \dots, Z_5\}$, we imagine that $Z_j$ is the unobserved confounder, and we quantify the values of $RR_{XZ_j}, RR_{Z_jY}$. Specifically, we compute the following values:

\[ \frac{\sum_{z_{-j}}P(Z_j = z_j \mid X=x_1, Z_{-j}=z_{-j})P(Z_{-j}=z_{-j})}{\sum_{z_{-j}}P(Z_j = z_j \mid X=x_0, Z_{-j}=z_{-j})P(Z_{-j}=z_{-j})} \] for $z_j \in \{0, 1\}$; this quantifies the increase in the risk-ratio for $Z_j=z_j$ from a $x_0 \to x_1$ transition, averaged across $Z_{-j}$ values (note that we are not taking the worst-case value here).

Furthermore, similarly we compute: \[\begin{align} \frac{\sum_{z_{-j}} P(Y=1 \mid X=x, Z_j = z_j, Z_{-j} = z_{-j})}{\sum_{z_{-j}} P(Y=1 \mid X=x, Z_j = 1-z_j, Z_{-j} = z_{-j})} \end{align}\] for $z_j \in \{0, 1\}, x \in \{0, 1\}$. This allows us to quantify the largest increase in the risk-ratio for $Y=1$ from a $Z_j = 0 \to Z_j = 1$ or a $Z_j = 1 \to Z_j = 0$ transition across values of $X=x$, averaged across $Z_{-j}$ values (note that we are not taking the worst-case value here). Intuitively, this corresponds to the $RR_{Z_jY}$ parameter.

# RR_{X Zj}: how much Zj's distribution changes across treatment groups (X->Z_j strength)
rr_xz <- vapply(z_bin, function(zj) {
  rr1 <- (s_z1_x1[zj] / n_tot) / (s_z1_x0[zj] / n_tot)  # Zj=1 contrast across X
  rr0 <- (s_z0_x1[zj] / n_tot) / (s_z0_x0[zj] / n_tot)  # Zj=0 contrast across X
  max(rr1, rr0)
}, numeric(1))

# RR_{Y Zj | X}: how much toggling Zj changes risk at fixed X (Z_j->Y strength)
rr_yz <- vapply(z_bin, function(zj) {
  rr_x1 <- (s_y_x1_z1[zj] / n_tot) / (s_y_x1_z0[zj] / n_tot) # within X=1
  rr_x0 <- (s_y_x0_z1[zj] / n_tot) / (s_y_x0_z0[zj] / n_tot) # within X=0
  max(rr_x1, rr_x0, 1/rr_x1, 1/rr_x0)
}, numeric(1))

# inverse bias factor: (r_xz+r_yz-1)/(r_xz*r_yz)
uc_sum <- data.table(conf = z_bin, rr_xz = rr_xz, rr_yz = rr_yz)
uc_sum[, inv_B := (rr_xz + rr_yz - 1) / (rr_xz * rr_yz)]
uc_sum[, lower_bound := inv_B * trr_cv]
setorder(uc_sum, -rr_xz)
uc_sum

The summary table is informative. An unobserved confounder approximately as strong as metastatic cancer would explain away the effect we are seeing. However, a UC as strong as other confounders (fluids & electrolytes disorder, coagulopathy, arrythmia, liver disease) would not explain away the effect we are seeing.

2 Linear Continuous Sensitivity

In the VAHCS study, a cohort of adolescents was tracked over time, in order to investigate the impact of marijuana usage on mental health.

load(file.path(root, "data/vahcs/data/vahcs_data.rda"))
dat <- dat[complete.cases(dat)]
vars_fact <- setdiff(names(dat), c("mhs"))
for (var in vars_fact) dat[, c(var) := as.numeric(get(var)) - 1]

# VAHCS study data
head(dat)

Variable thc encodes whether the individuals use marijuana. The outcome mhs represents the Clinical Interview Schedule-Revised (CIS-R) score, which is standardized and log-transformed. Higher values of mhs represent worse mental health. To investigate the impact of marijuana, wee first regress the outcome onto all other variables.

lmod <- lm(mhs ~ ., data = dat)
summary(lmod)

## 
## Call:
## lm(formula = mhs ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2997 -0.5377  0.1730  0.6472  2.5517 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.51264    0.06203  -8.264 8.52e-16 ***
## pari         0.04423    0.07741   0.571   0.5680    
## pard         0.11451    0.09667   1.184   0.2367    
## asb          0.11776    0.12953   0.909   0.3636    
## cmd          0.72994    0.07589   9.618  < 2e-16 ***
## alc          0.01921    0.08652   0.222   0.8244    
## thc          0.28744    0.14052   2.046   0.0412 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9108 on 621 degrees of freedom
## Multiple R-squared:  0.1728, Adjusted R-squared:  0.1648 
## F-statistic: 21.61 on 6 and 621 DF,  p-value: < 2.2e-16

As we can see, when assuming linearity, the effect of thc increases mhs by NA. We next investigate how this finding could be affected by possible unobserved confounding. We start by extracting the t-value and the degrees of freedom, and then computing

t_val <- summary(lmod)$coeff["thc", "t value"]
df <- lmod$df.residual

f <- t_val / sqrt(df)

This further allows us to compute the robustness values that ensures that the relative bias is less than 100%, that is, the maximal value $r^2$ of

\[ R^2_{Y\sim U\mid Z} = R^2_{X\sim U\mid Z} = r^2 \]

such that the effect estimate does not change sign. We compute the robustness value explicitly

rv <- 1 / 2 * (sqrt(f^4 + 4 * f^2) - f^2)
rv

## [1] 0.07878507

Therefore, a confounder with $ R^2_{YUZ} = R^2_{XUZ} > 0.08$ could possibly change the sign of our effect estimate. We verify our computation by using the sensmakr R-package:

rv_pkg <- sensemakr(lmod, "thc")$sensitivity_stats$rv_q

rv_pkg - rv

## [1] 0
## Parameters: q = 1, alpha = 1, invert = FALSE

2.1 Leave-One-Confounder-Out (LOCO) Heuristic

In the final step, we check the values obtained for $ R^2_{YUZ} = R^2_{XUZ} > 0.08$ when leaving out the observed confounders one-by-one.

x_nm <- "thc"
y_nm <- "mhs"

x <- dat[[x_nm]]
vars_z <- setdiff(names(dat), c(y_nm, x_nm))

loco <- data.table(
  z_drop = vars_z,
  r2_xu = NA_real_,
  r2_yu = NA_real_
)

for (i in seq_along(vars_z)) {

  zj <- vars_z[i]
  z_oth <- setdiff(vars_z, zj)

  # --- R^2_{X ~ Z_j | Z_-j}
  fml_xz <- as.formula(paste0("x ~ ", paste(z_oth, collapse = " + ")))
  fml_uz <- as.formula(paste0(zj, " ~ ", paste(z_oth, collapse = " + ")))

  x_res <- resid(lm(fml_xz, data = dat))
  u_res <- resid(lm(fml_uz, data = dat))

  loco[i, r2_xu := summary(lm(u_res ~ x_res))$r.squared]

  # R^2_{Y ~ Z_j | X, Z_-j}
  fml_yz <- as.formula(paste0(y_nm, " ~ ", x_nm, " + ", paste(z_oth, collapse = " + ")))
  fml_uz2 <- as.formula(paste0(zj, " ~ ", x_nm, " + ", paste(z_oth, collapse = " + ")))

  y_res <- resid(lm(fml_yz, data = dat))
  u_res2 <- resid(lm(fml_uz2, data = dat))

  loco[i, r2_yu := summary(lm(y_res ~ u_res2))$r.squared]
}

# compute the bias factor
loco[, bias_fact := sqrt( r2_xu * r2_yu / (1 - r2_xu))]
loco[, `:=`(r2_xu = round(r2_xu, 4), r2_yu = round(r2_yu, 4))]

setorder(loco, -bias_fact)
print(loco)

##    z_drop  r2_xu  r2_yu    bias_fact
##    <char>  <num>  <num>        <num>
## 1:    cmd 0.0037 0.1297 0.0218990925
## 2:    asb 0.0782 0.0013 0.0106187062
## 3:   pard 0.0314 0.0023 0.0085467924
## 4:    alc 0.1176 0.0001 0.0032518810
## 5:   pari 0.0001 0.0005 0.0002043617

bias_fact_crit <- sqrt(rv^2 / (1-rv))
bias_fact_crit

## [1] 0.08208493

As we can see, bias factor values above 0.082 can flip our findings. Removing any of the observed confounders does not result in bias factor values as high as this. The largest bias factor is observed for cmd, which captures the presence of depression/anxiety during adolescence. As can be seen, cmd has a partial $R^2$ value with $Y$ larger than our robustness value, but the $R^2$ for $X$ is smaller. For anti-social behavior asb, the opposite is observed. It has a partial $R^2$ value with $X$ close to our robustness value, but the partial $R^2$ with $Y$ is smaller. In conclusion, an unobserved confounder would need to have larger partial $R^2$ values than we obtain by leaving out observed confounders.