We begin by loading the data for investigating the obesity paradox. The key data that we need includes body mass index (BMI, treatment \(X\)), in-hospital mortality (outcome \(Y\)), and the confounding variables, including age and comorbidities in the Elixhauser score.
src <- "miiv" # MIMIC-IV data
data <- load_concepts(c("death", "bmi_all", "elix_components", "age"), src,
id_type = "patient", verbose = FALSE)
data <- data[is.na(death), death := FALSE]
data[, c(meta_vars(data), "elix_components") := NULL]
# focus on patients with BMI recorded, and ICD code available
data <- data[complete.cases(data)]
head(data)We remove Obesity/Weight Loss comorbidities, as these are directly related to the treatment itself. We further combine diabetes with and without complications. We focus only on patients who have a BMI in the range \([18.5, 35)\, kg/m^2\), meaning that we exclude underweight (\(<18.5\)) and class II and III obese \((\geq 35)\) patients.
# remove Obesity and Weight Loss as they are closely related to BMI
data[, c("Obesity", "WeightLoss") := NULL]
# combine diabetes and diabetes with complications
data[, DM := pmax(DM, DMcx)]
data[, DMcx := NULL]
# remove underweight and class II/III obese patients
data <- data[bmi_all >= 18.5 & bmi_all < 35]For performing the analysis, to reduce dimensionality, we focus on comorbidities that are predictive for death.
# regress death onto comorbidities and age in a logistic model
logreg <- glm(death ~ . - bmi_all + (bmi_all < 25), data = data,
family = "binomial")
# get the coefficients
lr_coef <- summary(logreg)$coefficients
# order the variables by coefficient size
lr_coef <- lr_coef[order(lr_coef[, 1], decreasing = TRUE), ]
# focus on comorbidities that (significantly) increase mortality
sel <- lr_coef[, 1] > 0 & lr_coef[, 4] < 0.05
cmb_sel <- setdiff(rownames(lr_coef)[sel], c("age"))
data[, bmi_ind := (bmi_all < 25)]For simplicity, we subset the data to include the five most predictive comorbidities.
We first write some helper functions for fitting an
xgboost model.
cv_xgb <- function(df, y, weights = NULL, ...) {
dtrain <- xgb.DMatrix(data = as.matrix(df), label = y, weight = weights)
binary <- all(y %in% c(0, 1))
if (binary) {
params <- list(objective = "binary:logistic", eval_metric = "logloss")
} else {
params <- list(objective = "reg:squarederror", eval_metric = "rmse")
}
cv <- xgb.cv(
params = params,
data = dtrain,
nrounds = 1000,
nfold = 5,
early_stopping_rounds = 10,
prediction = TRUE,
verbose = FALSE, ...
)
xgb <- xgb.train(
params = params,
data = dtrain,
nrounds = cv$early_stop$best_iteration,
verbose = FALSE, ...
)
attr(xgb, "binary") <- binary
xgb
}
pred_xgb <- function(xgb, df_test, intervention = NULL, X = "X") {
if (!is.null(intervention)) {
df_test[[X]] <- intervention
}
predict(xgb, as.matrix(df_test))
}Then, we perform 5-fold cross-fitting. We leave one fold out, and
train xgboost models on the remaining folds. Our goal is to
compute the conditional probabilities \[ P(Y
= 1 \mid X, Z) \] which allows us to compute the estimated total
risk ratio (TRR) through \[ \frac{P(Y = 1
\mid do(X = x_1)) } {P(Y = 1 \mid do(X = x_0)) } = \frac{\sum_{i} P(Y =
1 \mid X=x_1, Z=z_i)}{\sum_i P(Y=1\mid X=x_0, Z=z_i)}.\]
Furthermore, we also compute conditional probabilities \[ P(Z_j \mid X, Z_{-j})\] which we use
later for reasoning about the strength of a possible unobserved
confounder and the associated parameters \(RR_{XU}, RR_{UY}\).
z_bin <- cmb_sel[1:5]
x_nm <- "bmi_ind"
y_nm <- "death"
k <- 5L
set.seed(1)
fold_id <- sample.int(k, nrow(dat_fin), replace = TRUE)
# predictors for outcome model (Y | X,Z)
df_y_all <- dat_fin[, -c(y_nm), with = FALSE]
y_all <- as.integer(dat_fin[[y_nm]])
# storage (accumulate held-out sums across folds)
s_y_x1 <- 0; s_y_x0 <- 0; n_tot <- 0L
s_z1_x1 <- setNames(numeric(length(z_bin)), z_bin) # sum P(Zj=1 | X=1, Z_-j)
s_z1_x0 <- setNames(numeric(length(z_bin)), z_bin) # sum P(Zj=1 | X=0, Z_-j)
s_z0_x1 <- setNames(numeric(length(z_bin)), z_bin) # sum P(Zj=0 | X=1, Z_-j)
s_z0_x0 <- setNames(numeric(length(z_bin)), z_bin) # sum P(Zj=0 | X=0, Z_-j)
s_y_x1_z1 <- setNames(numeric(length(z_bin)), z_bin) # sum E[Y | X=1,Zj=1,Z_-j]
s_y_x1_z0 <- setNames(numeric(length(z_bin)), z_bin) # sum E[Y | X=1,Zj=0,Z_-j]
s_y_x0_z1 <- setNames(numeric(length(z_bin)), z_bin) # sum E[Y | X=0,Zj=1,Z_-j]
s_y_x0_z0 <- setNames(numeric(length(z_bin)), z_bin) # sum E[Y | X=0,Zj=0,Z_-j)
# cross-validated approach
for (fold in seq_len(k)) {
idx_te <- which(fold_id == fold)
idx_tr <- which(fold_id != fold)
n_k <- length(idx_te)
n_tot <- n_tot + n_k
# outcome model: fit on train, predict risks on held-out under do(X=1) and do(X=0)
mod_y <- cv_xgb(df_y_all[idx_tr], y_all[idx_tr])
df_te <- copy(df_y_all[idx_te])
p_x1 <- pred_xgb(mod_y, df_te, intervention = 1L, X = x_nm)
p_x0 <- pred_xgb(mod_y, df_te, intervention = 0L, X = x_nm)
s_y_x1 <- s_y_x1 + sum(p_x1) # sum_i \hat P(Y=1 | do(X=1), Z_i) over held-out i
s_y_x0 <- s_y_x0 + sum(p_x0) # sum_i \hat P(Y=1 | do(X=0), Z_i) over held-out i
# per-confounder models / toggles
for (zj in z_bin) {
# selection model: Zj | X, Z_-j (fit on train, predict on held-out)
df_x_all <- dat_fin[, c(x_nm, setdiff(names(dat_fin), c(y_nm, zj, x_nm))), with = FALSE]
y_z_all <- as.integer(dat_fin[[zj]])
mod_z <- cv_xgb(df_x_all[idx_tr], y_z_all[idx_tr])
df_x_te <- copy(df_x_all[idx_te])
q1 <- pred_xgb(mod_z, df_x_te, intervention = 1L, X = x_nm) # \hat P(Zj=1 | X=1, Z_-j)
q0 <- pred_xgb(mod_z, df_x_te, intervention = 0L, X = x_nm) # \hat P(Zj=1 | X=0, Z_-j)
s_z1_x1[zj] <- s_z1_x1[zj] + sum(q1)
s_z1_x0[zj] <- s_z1_x0[zj] + sum(q0)
s_z0_x1[zj] <- s_z0_x1[zj] + sum(1 - q1) # \hat P(Zj=0 | X=1, Z_-j)
s_z0_x0[zj] <- s_z0_x0[zj] + sum(1 - q0) # \hat P(Zj=0 | X=0, Z_-j)
# outcome strength: Y | X, Z. Using the *same* mod_y, toggle (X, Zj) on held-out.
df1 <- copy(df_te); df1[[x_nm]] <- 1L; df1[[zj]] <- 1L
df0 <- copy(df_te); df0[[x_nm]] <- 1L; df0[[zj]] <- 0L
s_y_x1_z1[zj] <- s_y_x1_z1[zj] + sum(pred_xgb(mod_y, df1))
s_y_x1_z0[zj] <- s_y_x1_z0[zj] + sum(pred_xgb(mod_y, df0))
df1 <- copy(df_te); df1[[x_nm]] <- 0L; df1[[zj]] <- 1L
df0 <- copy(df_te); df0[[x_nm]] <- 0L; df0[[zj]] <- 0L
s_y_x0_z1[zj] <- s_y_x0_z1[zj] + sum(pred_xgb(mod_y, df1))
s_y_x0_z0[zj] <- s_y_x0_z0[zj] + sum(pred_xgb(mod_y, df0))
}
}We first compute the TRR, and the E-value:
## [1] 1.084788
## [1] 1.388064
To explain away the effect, an unobserved confounder \(U\) would have to (i) incurr a 1.39-fold increase in the risk ratio for either \(U=1\) or \(U=0\) from a \(x_0 \to x_1\) transition; (ii) be associated with a 1.39-fold increase in the risk-ratio for \(Y\) from a \(U=0 \to U=1\) or a \(U=1 \to U=0\) change.
We next try to get a sense of how large the values \(RR_{XU}, RR_{UY}\) may be in practice. For this, we proceed as follows. For each \(Z_j \in \{Z_1, \dots, Z_5\}\), we imagine that \(Z_j\) is the unobserved confounder, and we quantify the values of \(RR_{XZ_j}, RR_{Z_jY}\). Specifically, we compute the following values:
\[ \frac{\sum_{z_{-j}}P(Z_j = z_j \mid X=x_1, Z_{-j}=z_{-j})P(Z_{-j}=z_{-j})}{\sum_{z_{-j}}P(Z_j = z_j \mid X=x_0, Z_{-j}=z_{-j})P(Z_{-j}=z_{-j})} \] for \(z_j \in \{0, 1\}\); this quantifies the increase in the risk-ratio for \(Z_j=z_j\) from a \(x_0 \to x_1\) transition, averaged across \(Z_{-j}\) values (note that we are not taking the worst-case value here).
Furthermore, similarly we compute: \[\begin{align} \frac{\sum_{z_{-j}} P(Y=1 \mid X=x, Z_j = z_j, Z_{-j} = z_{-j})}{\sum_{z_{-j}} P(Y=1 \mid X=x, Z_j = 1-z_j, Z_{-j} = z_{-j})} \end{align}\] for \(z_j \in \{0, 1\}, x \in \{0, 1\}\). This allows us to quantify the largest increase in the risk-ratio for \(Y=1\) from a \(Z_j = 0 \to Z_j = 1\) or a \(Z_j = 1 \to Z_j = 0\) transition across values of \(X=x\), averaged across \(Z_{-j}\) values (note that we are not taking the worst-case value here). Intuitively, this corresponds to the \(RR_{Z_jY}\) parameter.
# RR_{X Zj}: how much Zj's distribution changes across treatment groups (X->Z_j strength)
rr_xz <- vapply(z_bin, function(zj) {
rr1 <- (s_z1_x1[zj] / n_tot) / (s_z1_x0[zj] / n_tot) # Zj=1 contrast across X
rr0 <- (s_z0_x1[zj] / n_tot) / (s_z0_x0[zj] / n_tot) # Zj=0 contrast across X
max(rr1, rr0)
}, numeric(1))
# RR_{Y Zj | X}: how much toggling Zj changes risk at fixed X (Z_j->Y strength)
rr_yz <- vapply(z_bin, function(zj) {
rr_x1 <- (s_y_x1_z1[zj] / n_tot) / (s_y_x1_z0[zj] / n_tot) # within X=1
rr_x0 <- (s_y_x0_z1[zj] / n_tot) / (s_y_x0_z0[zj] / n_tot) # within X=0
max(rr_x1, rr_x0, 1/rr_x1, 1/rr_x0)
}, numeric(1))
# inverse bias factor: (r_xz+r_yz-1)/(r_xz*r_yz)
uc_sum <- data.table(conf = z_bin, rr_xz = rr_xz, rr_yz = rr_yz)
uc_sum[, inv_B := (rr_xz + rr_yz - 1) / (rr_xz * rr_yz)]
uc_sum[, lower_bound := inv_B * trr_cv]
setorder(uc_sum, -rr_xz)
uc_sumThe summary table is informative. An unobserved confounder approximately as strong as metastatic cancer would explain away the effect we are seeing. However, a UC as strong as other confounders (fluids & electrolytes disorder, coagulopathy, arrythmia, liver disease) would not explain away the effect we are seeing.
In the VAHCS study, a cohort of adolescents was tracked over time, in order to investigate the impact of marijuana usage on mental health.
load(file.path(root, "data/vahcs/data/vahcs_data.rda"))
dat <- dat[complete.cases(dat)]
vars_fact <- setdiff(names(dat), c("mhs"))
for (var in vars_fact) dat[, c(var) := as.numeric(get(var)) - 1]
# VAHCS study data
head(dat)Variable thc encodes whether the individuals use
marijuana. The outcome mhs represents the Clinical
Interview Schedule-Revised (CIS-R) score, which is standardized and
log-transformed. Higher values of mhs represent worse
mental health. To investigate the impact of marijuana, wee first regress
the outcome onto all other variables.
##
## Call:
## lm(formula = mhs ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2997 -0.5377 0.1730 0.6472 2.5517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.51264 0.06203 -8.264 8.52e-16 ***
## pari 0.04423 0.07741 0.571 0.5680
## pard 0.11451 0.09667 1.184 0.2367
## asb 0.11776 0.12953 0.909 0.3636
## cmd 0.72994 0.07589 9.618 < 2e-16 ***
## alc 0.01921 0.08652 0.222 0.8244
## thc 0.28744 0.14052 2.046 0.0412 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9108 on 621 degrees of freedom
## Multiple R-squared: 0.1728, Adjusted R-squared: 0.1648
## F-statistic: 21.61 on 6 and 621 DF, p-value: < 2.2e-16
As we can see, when assuming linearity, the effect of
thc increases mhs by NA. We next investigate
how this finding could be affected by possible unobserved confounding.
We start by extracting the t-value and the degrees of freedom, and then
computing
This further allows us to compute the robustness values that ensures that the relative bias is less than 100%, that is, the maximal value \(r^2\) of
\[ R^2_{Y\sim U\mid Z} = R^2_{X\sim U\mid Z} = r^2 \]
such that the effect estimate does not change sign. We compute the robustness value explicitly
## [1] 0.07878507
Therefore, a confounder with $ R^2_{YUZ} = R^2_{XUZ} > 0.08$ could
possibly change the sign of our effect estimate. We verify our
computation by using the sensmakr R-package:
## [1] 0
## Parameters: q = 1, alpha = 1, invert = FALSE
In the final step, we check the values obtained for $ R^2_{YUZ} = R^2_{XUZ} > 0.08$ when leaving out the observed confounders one-by-one.
x_nm <- "thc"
y_nm <- "mhs"
x <- dat[[x_nm]]
vars_z <- setdiff(names(dat), c(y_nm, x_nm))
loco <- data.table(
z_drop = vars_z,
r2_xu = NA_real_,
r2_yu = NA_real_
)
for (i in seq_along(vars_z)) {
zj <- vars_z[i]
z_oth <- setdiff(vars_z, zj)
# --- R^2_{X ~ Z_j | Z_-j}
fml_xz <- as.formula(paste0("x ~ ", paste(z_oth, collapse = " + ")))
fml_uz <- as.formula(paste0(zj, " ~ ", paste(z_oth, collapse = " + ")))
x_res <- resid(lm(fml_xz, data = dat))
u_res <- resid(lm(fml_uz, data = dat))
loco[i, r2_xu := summary(lm(u_res ~ x_res))$r.squared]
# R^2_{Y ~ Z_j | X, Z_-j}
fml_yz <- as.formula(paste0(y_nm, " ~ ", x_nm, " + ", paste(z_oth, collapse = " + ")))
fml_uz2 <- as.formula(paste0(zj, " ~ ", x_nm, " + ", paste(z_oth, collapse = " + ")))
y_res <- resid(lm(fml_yz, data = dat))
u_res2 <- resid(lm(fml_uz2, data = dat))
loco[i, r2_yu := summary(lm(y_res ~ u_res2))$r.squared]
}
# compute the bias factor
loco[, bias_fact := sqrt( r2_xu * r2_yu / (1 - r2_xu))]
loco[, `:=`(r2_xu = round(r2_xu, 4), r2_yu = round(r2_yu, 4))]
setorder(loco, -bias_fact)
print(loco)## z_drop r2_xu r2_yu bias_fact
## <char> <num> <num> <num>
## 1: cmd 0.0037 0.1297 0.0218990925
## 2: asb 0.0782 0.0013 0.0106187062
## 3: pard 0.0314 0.0023 0.0085467924
## 4: alc 0.1176 0.0001 0.0032518810
## 5: pari 0.0001 0.0005 0.0002043617
## [1] 0.08208493
As we can see, bias factor values above 0.082 can flip our findings.
Removing any of the observed confounders does not result in bias factor
values as high as this. The largest bias factor is observed for
cmd, which captures the presence of depression/anxiety
during adolescence. As can be seen, cmd has a partial \(R^2\) value with \(Y\) larger than our robustness value, but
the \(R^2\) for \(X\) is smaller. For anti-social behavior
asb, the opposite is observed. It has a partial \(R^2\) value with \(X\) close to our robustness value, but the
partial \(R^2\) with \(Y\) is smaller. In conclusion, an
unobserved confounder would need to have larger partial \(R^2\) values than we obtain by leaving out
observed confounders.