commit53f6766dcffe1735e1b0f7d1a21dd52c54260877parentba413b4479c4faef9c3f1366a1cd98f2e2bee6fcAuthor:Eamon Caddigan <eamon.caddigan@gmail.com>Date:Mon, 16 May 2022 19:58:59 -0400 Workaround for a broken PSweightDiffstat:

A | PSweight_issues.Rmd | | | 127 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |

1 file changed, 127 insertions(+), 0 deletions(-)diff --git a/PSweight_issues.Rmd b/PSweight_issues.Rmd@@ -0,0 +1,127 @@ +--- +title: "PSweight Issues" +author: "Eamon Caddigan, Ph.D." +date: "5/16/2022" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +PSweight 1.1.6 has issues that prevent it from completing analyses in some environments. The newest version of the package, 1.1.7, cannot currently be installed on some machines that I use because of separate dependency issues. + +This document examines how to replicate some of the functionality of PSweight so that data can be analyzed (specifically, propensity score weighting with overlap weights) without depending on it. + +I'm going to follow the example provided in the PSweight vignette. + +```{r libraries} +library(PSweight) +library(dplyr) +library(ggplot2) +``` + +# Propensity Scores + +By default, `SumStat` and `PSweight` just use a GLM with the logistic link function. + +```{r propensity} +ps.any <- Dany ~ white + maemp + as.factor(scht) + as.factor(qmab) + + as.factor(qmab2) + as.factor(qvab) + as.factor(qvab2) + paed_u + maed_u + + agepa + agema + sib_u + paed_u * agepa + maed_u * agema + +bal.any <- SumStat(ps.formula = ps.any, data = NCDS, + weight = "overlap") + +prop_model <- glm(ps.any, family = binomial(link = "logit"), data = NCDS) +propensities <- predict(prop_model, type = "response") +overlap_weights = ifelse(NCDS$Dany == 1, + 1 - propensities, + propensities) + +all.equal(unname(bal.any$propensity[, "1"]), + unname(propensities)) +``` + +# Density and Love Plots + +Density plots are trivially easy. + +```{r density_plot} +NCDS %>% + mutate(propensity = propensities, + Dany = as.factor(Dany)) %>% + ggplot(aes(propensity)) + + geom_density(aes(fill = Dany), + alpha = 0.2) + + theme_minimal() +``` + +Love plots require more work. The default balance plot uses the average standardized difference (ASD) with weighted variance, and we'll use that too. + +```{r love_plot} +# Return the weighted estimate of x at z level z_i using weights w +weighted_est <- function(z, w, x, z_i) { + stopifnot(z %in% c(0, 1), + z_i %in% z) + sum(w[z == z_i] * x[z == z_i]) / sum(w[z == z_i]) +} + +# Return the weighted difference in x using the weights w and groupings z +weighted_diffrence <- function(z, w, x) { + weighted_est(z, w, x, 1) - weighted_est(z, w, x, 0) +} + +asd <- function(z, w, x) { + # TODO: weighted variance is tricky and I don't want it to be a blocker + abs(weighted_difference(z, w, x)) / sqrt(NA) +} +``` + +I'll come back to this (maybe) + +# Weighted differences + +The whole point of this is to look at weighted differences (and doubly robust estimators). Point estimates are easy! + +```{r weighted_difference_psweight} +ato.any <- PSweight(ps.formula = ps.any, yname = "wage", data = NCDS, bootstrap = TRUE) + +ato.any +``` + +```{r diff_point_est} +c(weighted_est(NCDS$Dany, overlap_weights, NCDS$wage, 0), + weighted_est(NCDS$Dany, overlap_weights, NCDS$wage, 1)) +``` + +Variance estimates are trickier. The sandwich estimator is computationally simpler but more difficult to implement, and bootstrapping is the opposite. I'm going with the latter. + +```{r diff_summary} +summary(ato.any) +``` + +```{r} +diff_reps <- replicate( + 100, + { + n <- sample.int(nrow(NCDS), replace = TRUE) + prop_model <- glm(ps.any, family = binomial(link = "logit"), data = NCDS[n, ]) + propensities <- predict(prop_model, type = "response") + overlap_weights = ifelse(NCDS$Dany[n] == 1, + 1 - propensities, + propensities) + weighted_diffrence(NCDS$Dany[n], overlap_weights, NCDS$wage[n]) + } +) + +diff_results <- c(Estimate = weighted_diffrence(NCDS$Dany, overlap_weights, NCDS$wage), + Std.Err = sd(diff_reps), + lwr = quantile(diff_reps, 0.025), + upr = quantile(diff_reps, 0.975), + p.val = mean(diff_reps < 0)) + +diff_results +``` + +Pretty close, we'll use this.