commit 53f6766dcffe1735e1b0f7d1a21dd52c54260877
parent ba413b4479c4faef9c3f1366a1cd98f2e2bee6fc
Author: Eamon Caddigan <eamon.caddigan@gmail.com>
Date: Mon, 16 May 2022 19:58:59 -0400
Workaround for a broken PSweight
Diffstat:
A | PSweight_issues.Rmd | | | 127 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 127 insertions(+), 0 deletions(-)
diff --git a/PSweight_issues.Rmd b/PSweight_issues.Rmd
@@ -0,0 +1,127 @@
+---
+title: "PSweight Issues"
+author: "Eamon Caddigan, Ph.D."
+date: "5/16/2022"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+PSweight 1.1.6 has issues that prevent it from completing analyses in some environments. The newest version of the package, 1.1.7, cannot currently be installed on some machines that I use because of separate dependency issues.
+
+This document examines how to replicate some of the functionality of PSweight so that data can be analyzed (specifically, propensity score weighting with overlap weights) without depending on it.
+
+I'm going to follow the example provided in the PSweight vignette.
+
+```{r libraries}
+library(PSweight)
+library(dplyr)
+library(ggplot2)
+```
+
+# Propensity Scores
+
+By default, `SumStat` and `PSweight` just use a GLM with the logistic link function.
+
+```{r propensity}
+ps.any <- Dany ~ white + maemp + as.factor(scht) + as.factor(qmab) +
+ as.factor(qmab2) + as.factor(qvab) + as.factor(qvab2) + paed_u + maed_u +
+ agepa + agema + sib_u + paed_u * agepa + maed_u * agema
+
+bal.any <- SumStat(ps.formula = ps.any, data = NCDS,
+ weight = "overlap")
+
+prop_model <- glm(ps.any, family = binomial(link = "logit"), data = NCDS)
+propensities <- predict(prop_model, type = "response")
+overlap_weights = ifelse(NCDS$Dany == 1,
+ 1 - propensities,
+ propensities)
+
+all.equal(unname(bal.any$propensity[, "1"]),
+ unname(propensities))
+```
+
+# Density and Love Plots
+
+Density plots are trivially easy.
+
+```{r density_plot}
+NCDS %>%
+ mutate(propensity = propensities,
+ Dany = as.factor(Dany)) %>%
+ ggplot(aes(propensity)) +
+ geom_density(aes(fill = Dany),
+ alpha = 0.2) +
+ theme_minimal()
+```
+
+Love plots require more work. The default balance plot uses the average standardized difference (ASD) with weighted variance, and we'll use that too.
+
+```{r love_plot}
+# Return the weighted estimate of x at z level z_i using weights w
+weighted_est <- function(z, w, x, z_i) {
+ stopifnot(z %in% c(0, 1),
+ z_i %in% z)
+ sum(w[z == z_i] * x[z == z_i]) / sum(w[z == z_i])
+}
+
+# Return the weighted difference in x using the weights w and groupings z
+weighted_diffrence <- function(z, w, x) {
+ weighted_est(z, w, x, 1) - weighted_est(z, w, x, 0)
+}
+
+asd <- function(z, w, x) {
+ # TODO: weighted variance is tricky and I don't want it to be a blocker
+ abs(weighted_difference(z, w, x)) / sqrt(NA)
+}
+```
+
+I'll come back to this (maybe)
+
+# Weighted differences
+
+The whole point of this is to look at weighted differences (and doubly robust estimators). Point estimates are easy!
+
+```{r weighted_difference_psweight}
+ato.any <- PSweight(ps.formula = ps.any, yname = "wage", data = NCDS, bootstrap = TRUE)
+
+ato.any
+```
+
+```{r diff_point_est}
+c(weighted_est(NCDS$Dany, overlap_weights, NCDS$wage, 0),
+ weighted_est(NCDS$Dany, overlap_weights, NCDS$wage, 1))
+```
+
+Variance estimates are trickier. The sandwich estimator is computationally simpler but more difficult to implement, and bootstrapping is the opposite. I'm going with the latter.
+
+```{r diff_summary}
+summary(ato.any)
+```
+
+```{r}
+diff_reps <- replicate(
+ 100,
+ {
+ n <- sample.int(nrow(NCDS), replace = TRUE)
+ prop_model <- glm(ps.any, family = binomial(link = "logit"), data = NCDS[n, ])
+ propensities <- predict(prop_model, type = "response")
+ overlap_weights = ifelse(NCDS$Dany[n] == 1,
+ 1 - propensities,
+ propensities)
+ weighted_diffrence(NCDS$Dany[n], overlap_weights, NCDS$wage[n])
+ }
+)
+
+diff_results <- c(Estimate = weighted_diffrence(NCDS$Dany, overlap_weights, NCDS$wage),
+ Std.Err = sd(diff_reps),
+ lwr = quantile(diff_reps, 0.025),
+ upr = quantile(diff_reps, 0.975),
+ p.val = mean(diff_reps < 0))
+
+diff_results
+```
+
+Pretty close, we'll use this.