# Section 9 # Matching # Many thanks to Iain Osgood # two packages we will use install.packages("MatchIt") install.packages("cem") library(MatchIt) library(cem) library(Zelig) ############ ### Data ### ############ ## LOAD DATA: Lalonde (1986) dataset; from cem package data(LL) head(LL) ?LL nrow(LL) summary(LL) # Data: # From job training program to increase earnings of program participants # Outcome: (post-treatment) earnings in 1978 (re78) # Treatment: # Participation in the job training program (treated = 1 or 0) # Non-random # Pretreatment covariates # age (age) # years of education (education) # marital status (married) # lack of a high school diploma (nodegree) # race (black, hispanic) # unemployed in 1974 (u74) and 1975 (u75) # earnings in 1974 (re74) and 1975 (re75) #Goal: # Effect of Program Participation on Earnings in 1978 #Approach: # Use matching to control pretreatment covariates # Make the treatment quasi-random # the unadjusted average treatment effect mean(LL\$re78[LL\$treated == 1]) - mean(LL\$re78[LL\$treated == 0]) summary(lm(re78 ~ treated + age + education + black + married + nodegree + re74 + re75 + hispanic + u74 + u75, data = LL)) ######################## ### Balance Checking ### ######################## ## PRE MATCH: check balance (multivariate and univariate) # imbalance() is in cem package pre.imbalance <- imbalance(group=LL\$treated, data=LL, drop=c("treated","re78")) pre.imbalance # confirm difference in means and quantiles for age distribution tr <- LL\$treated == 1 co <- LL\$treated == 0 mean(LL\$age[tr]) - mean(LL\$age[co]) quantile(LL\$age[], c(0,.25,.50,.75,1)) - quantile(LL\$age[LL\$treated ==0], c(0,.25,.50,.75,1)) # demonstrate why L1 for age is so low par(mfrow=c(1,2)) plot(density(LL\$age[tr])) plot(density(LL\$age[co])) # LCS: Local Common Support (LCS) measure, which is the proportion # of non-empty k-dimensional cells of the histogram which contain at # least one observation for treateds and controls. # what looks bad here? # overall multivariate balance is quite bad # imbalance for re34 and re75 is significant in means and quantiles # imbalance for age is significant in quantiles # Perfect balance on marginal distribution of each variable # does not mean perfect balance on joint distribution of all variables ############### ### MatchIt ### ############### #################### ## Exact Matching ## #################### ## EXACT MATCHING: Processing # Simplest version of matching # Process: # Matches each T=1 to all possible T=0 with exactly equal Xs # Result: # Perfect Balance: X values are guaranteed to be exactly the same # Drawback: # Seeks to minimize bias without regard to variance # Can also create bias in ATT if drop many treated units exact.match <- matchit(formula= treated ~ age + education + black + married + nodegree + re74 + re75 + hispanic + u74 + u75, data = LL, method = "exact") exact.match ## EXACT MATCHING: Analysis # How doe we find the treatment effect? # No remaining imbalance (e.g. exact matching) -> simple (weighted) diff in means # Still imbalance -> run statistical model including covariates # Why do we know that option one will be fine? # the match.data function converts the matchit output into # a useable data.frame, by dropping unmatched units and adding in # new information like weights, subclasses or distance measures. exact.data <- match.data(exact.match) head(exact.data) summary(exact.data) # note that everyone with a positive dollar income in # 1974 or 1975 has been dropped because they couldn't # be matched exactly. Other covariates don't vary too. # Is this the population we are interested in? exact.model <- lm(re78 ~ treated, data = exact.data, weights=exact.data\$weights) exact.data[exact.data\$subclass == 5,] # why do we need weights? # There are different numbers of treated and control units # matched in different groups, so the first thing we do # is give all treated units weight of 1, and all control # units are given weight n_ti/n_ci where n_ti is the number # of treated units in group i. Thus, if there are # more control units, you downweight each one # next, if matching is with replacment, each control unit's # weight is added up across the number of groups # in which it was matched (each match is only a member # of one group with exact matching, though) # finally, the control group weights are scaled to sum to the # number of uniquely matched control outcomes, so that the overall # weight of the two groups corresponds to the total amount # of information they provide. # so what is the treatment effect? summary(exact.model) # what about the complete model post exact matching? lm(re78 ~ treated+ age + education + black + married + nodegree + re74 + re75 + hispanic + u74 + u75, data = exact.data, weights=exact.data\$weights) # note that we could also get the same answer by calculating # the appropriate weighted average y.treat <- weighted.mean(exact.data\$re78[exact.data\$treated == 1], exact.data\$weights[exact.data\$treated == 1]) y.cont <- weighted.mean(exact.data\$re78[exact.data\$treated == 0], exact.data\$weights[exact.data\$treated == 0]) y.treat-y.cont ###################### ## Nearest Neighbor ## ###################### ## NEAREST NEIGHBOR MATCHING: Processing # Default method for matchit() # Procedure: # matches each T=1 to one "close" T=0 # 1-1 matching = default # 1-k allowed with ratio argument # specify the metric using the distance argument # default is "logit" # distance = the propensity score (i.e. PR(T|X)) # other distance measures include: # other binomial glms for pscore estimation # mahalanobis: # distance in multidimensional space # considers covariance in Xs unlike euclidean distance # d(Ti, Cj) = ( t(X_Ti-X_Cj) VCV(X_Cj)^(-1) (X_Ti-X_Cj) )^1/2 # Ti matched to Cj with the min d(Ti,Cj) # additional arguments include: # match with replacement # caliper: tolerance criteria limiting distance (radius) within which C is chosen nearest.match <- matchit(formula = treated ~ age + education + black + married + nodegree + re74 + re75 + hispanic + u74 + u75, data = LL, method = "nearest", distance = "logit") summary(nearest.match) plot(nearest.match) pre.balance <- summary(nearest.match)\$sum.all post.balance <- summary(nearest.match)\$sum.matched plot(x = (pre.balance[,1] - pre.balance[,2])/pre.balance[,3], y = 1:nrow(pre.balance), pch = 19, col = "burlywood2", xlab = "Standardized Imbalance", axes = FALSE, xlim = c(-.25,.25), ylab = "") points(x = (post.balance[,1] - post.balance[,2])/post.balance[,3], y = 1:nrow(post.balance), pch = 19, col = "cadetblue2") abline(v = 0, lty = "dashed") axis(1) axis(2, at = 11:1, labels = rownames(pre.balance), las = 1, cex.axis = .8) legend("topright", legend = c("Pre-match","Post-match"), pch = c(19,19), col = c("burlywood2","cadetblue2"), cex = .8) #summary() provides: # means # sd of control group # mean differences # comparison of quantiles # the matched call # the number of units matched, unmatched, or discarded # the percent improvement in balance for each measure of balance # (smaller values indicate better balance) ## NEAREST NEIGHBOR MATCHING: Analysis # extract matched data nearest.data <- match.data(nearest.match) head(nearest.data) ## non-parametric estimate of the ATT mean(nearest.data\$re78[nearest.data\$treated == 1]) - mean(nearest.data\$re78[nearest.data\$treated == 0]) ## A model-based estimate of the ATT nearest.model <- lm(re78 ~ treated + age + education + black + married + nodegree + re74 + re75 + hispanic + u74 + u75, data = nearest.data) summary(nearest.model) ######### ## CEM ## ######### ### CEM MATCH # cem is also implemented in MatchIt # Procedure: # exact matching on coarsened data to determine matches # first, define strata s.t. each has identical values for all coarsened pre-treatment X # second, sort all observations into strata # third, discard all C observations in any stratum without at least 1 T # Simultaneously, cem throws out T's that consitute extreme counterfactuals # by restricting all data to areas of common support. # Step 1 requires specifying a degree of coarsening for each covariate # Fully automated procedure or # Choose coarsening ourselves (better) # set coarsening s.t. substantively indistinguishable values grouped together # Note: if you would like to reintegrate dropped T's, you can do so # by imputing some counterfactuals C's using an appropriate model, # but this will generate model dependence. ## cem match: automatic bin choice auto.match <- matchit(formula = treated ~ age + education + black + married + nodegree + re74 + re75 + hispanic + u74 + u75, data = LL, method = "cem") auto.match ## cem match: user chosen coarsening re74cut <- hist(LL\$re74, plot=FALSE)\$breaks re75cut <- seq(0, max(LL\$re75)+1000, by=1000) agecut <- c(20.5, 25.5, 30.5,35.5,40.5) my.cutpoints <- list(re75=re75cut, re74=re74cut, age=agecut) user.match <- matchit(formula = treated ~ age + education + black + married + nodegree + re74 + re75 + hispanic + u74 + u75, data = LL, method = "cem", cutpoints = my.cutpoints) user.data <- match.data(user.match) auto.data <- match.data(auto.match) user.imb <- imbalance(group=user.data\$treated, data=user.data, drop= c("treated","re78","distance","weights","subclass")) auto.imb <- imbalance(group=auto.data\$treated, data=auto.data, drop= c("treated","re78","distance","weights","subclass")) names(user.imb) user.imb\$L1 # user.match has better balance auto.imb\$L1 # so less bias! summary(user.match)\$nn # user.match has less data summary(auto.match)\$nn # so more variance! # numerical vs categorical variables # coarsening for numerical variables -> cutpoints arugment # coarsening for categorical variables -> grouping argument ### CAUSAL EFFECTS w/ CEM ## for the moment the att() function does not appear to be included ## in the MatchIt library, so calculating an att with the full ## set of treated units requires using cem() in the cem package cem.match <- cem(treatment = "treated", data = LL, drop = "re78") ## Simple (weighted) difference in means; e.g. linear regression cem.match.att <- att(obj=cem.match, formula=re78 ~ treated, data = LL, model="linear") cem.match.att # default is linear model ## Remaining imbalance? -> model with covariates cem.match.att2 <- att(obj=user.match.1, formula=re78 ~ treated + age+education+black+married+nodegree+re74+re75+hispanic+u74+u75, data = LL, model="linear") cem.match.att2 ############### ###############