#### "Model Dependence and Matching" #### Harvard Gov2001 Code Library #### Instructor: Gary King #### Maintainers: Margaret Roberts and Brandon Stewart #### May 1, 2011 #### Version 0.5 #### Contact: bstewart@fas.harvard.edu # Summary: This files covers model dependence and ways to fix it. # # Table of Contents: # Intro to Model Dependence # Convex Hull and Model Fit # Confounding variable bias and post-treatment bias # Checking Balance # Propensity Score Matching # Mahalanobis Distance Matching and CEM Matching # How to Do Genetic Matching #### Contributors: Sorapop Kiatpongsan and Slawa Rokicki #### April 20, 2011 #### Model Dependence Intro #### Model Dependence: pg. 5 ########################################################################################## # We replicate some basic model dependence code from the slides # ########################################################################################## rm(list=ls(all=TRUE)) ## set seed set.seed(990) ## Make up some data x<-runif(40, min=1, max=2) y<-NA rand<-rnorm(40,mean=0, sd=1) for(i in 1:40){ y[i]<--.3+x[i]^2+rand[i] } ## Create linear model with predicted confidence intervals linear.model<-lm(y~x) newx<-seq(0,6,.1) CI<-predict(linear.model,data.frame(x = newx), level = 0.95, interval = "confidence") ## Create quadratic model and create x and y points of quad model quad.model<-lm(y~x+I(x^2)) summary(quad.model) quad.x<-seq(0,6,.1) quad.y<-quad.model$coeff[1]+quad.model$coeff[2]*quad.x+quad.model$coeff[3]*quad.x^2 ## Plot the points and specify y-axis ticks plot(x, y, xlim=c(0,6), ylim=c(-10,45),xlab="X", ylab="Y", main="Plot of Model Dependence") axis(2, at=seq(-10,45,5),labels=T) ## add linear model and quadratic model to plot abline(linear.model) lines(quad.x, quad.y, lty=2) ## add confidence intervals to linear model lines(newx, CI[,2],lty=3) lines(newx, CI[,3], lty=3) ## Add text to plot text(4.2, 0, "Solid: linear") text(4.2, -2.3, "(Dotted: 95% CI)") text(4,30, "Dashed: quadratic") savePlot(filename = "Figs/ModelDependence.pdf", type="pdf") #### Contributors: Allen Schmaltz and Robert Schub #### April 20, 2011 #### Convex Hull and Model Fit #### Model Dependence: pg. 18-20 ################################################################################# # This example makes use of the following dataset, which was provided # # as part of Assignment 5 for Government 2001 (March 2011): # # # # "Ethnicity, Insurgency and Civil War," by James Fearon and David # # Laitin (American Political Science Review. 97 (1) February 2003.) # # # # This code is intended to accompany slides 18-20 of Gary King's # # lecture slides titled "Model Dependence in Counterfactual # # Inference" (2010 version). What follows is for illustrative # # purposes. # # # # Using the above dataset, we first test whether certain # # counterfactuals are inside or outside of the convex hull. Unlike the # # Doyle and Sambanis example provided in the lecture slides, in this # # case there are a non-zero percentage of counterfactuals in the # # convex hull, suggesting that the model dependence here is # # potentially less severe than in the Doyle and Sambanis example. # # # # For illustrative purposes, we modify the model with one interaction # # term, analogous to the example in the lecture slides. The graphs # # suggest that the probabilities generated by the original and # # modified logit models using the in-sample data seem reasonably # # comparable. There is, however, rather more divergence when the # # counterfactual data is used, as illustrated in the graph. # # # # The code is a straightforward consequence of calls to the zelig() and # # whatif() functions. Inline comments below provide additional clarity. # ################################################################################# rm(list=ls(all=TRUE)) library(Zelig) library(WhatIf) ## Use Fearon and Laitin Data ## Choose "New State" as Treatment Condition load("Gov2001CodeLibrary.RData") dat<- fearon ## Creating proper subset of data (Table 1 Model 1) vars<-c("onset","nwstate","warl","gdpenl","lpopl1","lmtnest","ncontig","Oil", "instab","polity2l","ethfrac","relfrac") dat<-dat[,vars] dat$onset[dat$onset==4]<-1 dat<-na.omit(dat) dim(dat) ## Convex Hull: Flip "nwstate" from 0 to 1 and 1 to 0 ## Hull 1: Switch 1 to 0 a<-whatif(~warl+gdpenl+lpopl1+lmtnest+ncontig+Oil+instab+polity2l+ethfrac+relfrac, data=dat[dat$nwstate==0,], cfact=dat[dat$nwstate==1,]) summary(a) ## 47 out of 162 have a counterfactual in the hull nrow(dat[dat$nwstate==1,]) ## Hull 2: Switch 0 to 1 b<-whatif(~warl+gdpenl+lpopl1+lmtnest+ncontig+Oil+instab+polity2l+ethfrac+relfrac, data=dat[dat$nwstate==1,], cfact=dat[dat$nwstate==0,]) summary(b) ## 173 out of 6165 have a counterfactual in the hull ## Create Modified Model and Test In-Sample Fit and Counterfactual Fit ## Original Model orig<-zelig(onset~nwstate+warl+gdpenl+lpopl1+lmtnest+ncontig+Oil+ instab+polity2l+ethfrac+relfrac,data=dat, model="logit") ## Modified Model: Add interaction between "nwstate" and "Oil" mod<-zelig(onset~nwstate+warl+gdpenl+lpopl1+lmtnest+ncontig+Oil+ instab+polity2l+ethfrac+relfrac+nwstate*Oil,data=dat, model="logit") ## In Sample Fit Chart orig.fit<-orig$fitted.values mod.fit<-mod$fitted.values pdf(file="Figs/inSampleFitgraph.pdf") plot(x=mod.fit, y=orig.fit, xlab="Probabilities from Modified Model", ylab="Probabilities from Original Model",main="In Sample Fit", xlim=c(0,1),ylim=c(0,1)) abline(a=0,b=1,lty=4) dev.off() ## Counterfactual Fit Chart ## Here the coefficients calculated above for the original and modified ## models are used, but the generated counterfactual data matricies are used ## to cacluate the fitted values. ## create dataframes with the counterfactuals: ## dataCF is used to construct the data matrix for use with the original model coefficients ## dataCFmod is used to construct the data matrix for use with the modified model coefficients datCF <- dat datCFmod <- dat for (rowNum in seq(1, nrow(datCF), 1)) { if (dat[rowNum, c("nwstate")] == 0) { datCF[rowNum, c("nwstate")] = 1 datCFmod[rowNum, c("nwstate")] = 1 #the following is for the new interaction term: datCFmod[rowNum, 13] <- datCFmod[rowNum, c("nwstate")] * datCFmod[rowNum, c("Oil")] } else if (dat[rowNum, c("nwstate")] == 1) { datCF[rowNum, c("nwstate")] = 0 datCFmod[rowNum, c("nwstate")] = 0 #the following is for the new interaction term: datCFmod[rowNum, 13] <- datCFmod[rowNum, c("nwstate")] * datCFmod[rowNum, c("Oil")] } } ## get rid of Y, add the intercept, and coerce the dataframe to a matrix: datCFnew <- datCF ## set first column as intercept datCFnew[, 1] <- 1 datCFnew <- as.matrix(datCFnew) dim(datCFnew) head(datCFnew) as.matrix(orig$coefficients) dim(as.matrix(orig$coefficients)) origCF.fit <- plogis((as.matrix(datCFnew)) %*% (as.matrix(orig$coefficients))) ## The following line is equivalent: ## origCF.fit <- 1/(1 + exp(-(as.matrix(datCFnew)) %*% (as.matrix(orig$coefficients)))) ## get rid of Y, add the intercept, and coerce the dataframe to a matrix: datCFmodNew <- datCFmod ## set first column as intercept datCFmodNew[, 1] <- 1 datCFmodNew <- as.matrix(datCFmodNew) dim(datCFmodNew) head(datCFmodNew) modCF.fit <- plogis((as.matrix(datCFmodNew)) %*% (as.matrix(mod$coefficients))) pdf(file="Figs/counterfactualgraph.pdf") plot(x=modCF.fit, y=origCF.fit, xlab="Probabilities from Modified Model", ylab="Probabilities from Original Model",main="Counterfactual Prediction", xlim=c(0,1),ylim=c(0,1)) abline(a=0,b=1,lty=4) dev.off() #### Contributors: James Conran, Jeremy Ferwerda & Yue Hou #### April 20, 2011 #### Use simulated data to illustrate confounding variable bias and post-treatment bias #### Model Dependence: pg. 21 ####################################################################################### # We will look at a data generating process with stochastic component Y~N(mu,sigma) # # and systematic component mu = X*beta # # X will be a matrix of independent variables, beta is a corresponding vector of # # coefficients # # # # x1 will be the confounding variable - it will (by construction) be correlated # # with both Y and x2 leaving it out of the model will therefore give biased estimates # # of x2's causal effect # ####################################################################################### rm(list=ls(all=TRUE)) ## We will generate x1 randomly from the normal distribution with arbitrarily chosen mean ## and standard deviation x1 = rnorm(1000,11,3) ## so each observed value of x2 will be drawn from the normal with mean depending on the ## value of x1 and arbitrarily set (at 0.9 & 5) parameters mu_x2 = x1*0.9 N=1000 x2=c() for(i in 1:N){ x2[i] = rnorm(1,mu_x2[i],5) } ## R^2 in a bivariate regression of the two variables is 0.2 or thereabouts - ## this will vary on each run of this code summary(lm(x2~x1)) ## this is the data matrix for the true data generating process (the column of ones ## is for the intercept) X = cbind(1,x1,x2) ## set true parameter values b0 = 10 b1 = 1.5 b2 = 3 beta = c(b0,b1,b2) ## each observation of Y will have its own expected value, depending on its values of ## x1 & x2: mu_i = X_i * beta ## this is the vector of those expected values mu = X%*%beta ## so each of our 1000 observed outcomes are drawn from a normal distribution with mean ## mu_i and standard deviation sigma ## this data generating process happens to be homoskedastic so sigma is a scalar sigma = 6 Y=c() N=1000 for(i in 1:N){ Y[i] = rnorm(1,mu[i],sigma) } ## not knowing the data generating process we naively estimate the causal effect of x2 ## without any control variables summary(lm(Y~x2)) ## we conclude that a one unit increase in x2 causes an increase in Y about 3.4 ## (again the exact number will differ on each run) ## but this estimate suffers from omitted variable bias ## we can eliminate this bias by including x1 (the confounding variable) in the model: summary(lm(Y~x1+x2)) ## now the estimated causal effect of x2 is close to the true parameter of 3 ## We can show it wasn't just luck by simulating the whole process 1000 times biased = c() unbiased=c() for(i in 1:1000){ #draw x1 x1 = rnorm(1000,11,3) #the x2s we draw depend on the x1s as well as a stochastic component x2=c() mu_x2 = x1*0.9 for(j in 1:N){ x2[j] = rnorm(1,mu_x2[j],5) } X = cbind(1,x1,x2) #the Ys depend on x1 and x2 as well as a stochastic component mu = X%*%beta Y=c() for(k in 1:N){ Y[k] = rnorm(1,mu[k],sigma) } #estimate x2's causal effect naively in each sample/experiment biased[i] = lm(Y~x2)$coef[2] unbiased[i] = lm(Y~x1+x2)$coef[3] } ## histograms with blue line at the true parameter value and red line at the average estimate par(mfrow=c(1,2)) hist(biased,xlim=c(2.9,3.5)) abline(v=3,col="blue") abline(v=mean(biased),col="red") hist(unbiased,xlim=c(2.9,3.5)) abline(v=3,col="blue") abline(v=mean(unbiased),col="red") savePlot(filename ="Figs/Confounding.pdf", type="pdf") ## the estimate for x1 is also close to the true parameter of 1.5 ## however the estimate for x1 cannot be interpreted causally due to post-treatment bias ## we know that x1 is causally prior to x2 since it is part of the systematic component of the process generating x2 ## so by controlling for x2 we control for the consequence of the cause - post-treatment bias ## when estimating the causal effect of x1 on Y we therefore should not include x2 in the model summary(lm(Y~x1)) ## this gives an estimated coefficient of about 4 ## we know (because we decided) that the true causal effect of of a one unit ## change in x1 on Y is 1.5 + 0.9*3 = 4.2 ## that is the direct effect of x1 (b1=1.5) on Y plus its effect through x2 ## (b2 by the parameter we chose for the systematic component of x2, 0.9) ## again we can simulate this in the same way: ptb= c() no_ptb=c() for(i in 1:1000){ #draw x1 x1 = rnorm(1000,11,3) #the x2s we draw depend on the x1s as well as a stochastic component x2=c() mu_x2 = x1*0.9 for(j in 1:N){ x2[j] = rnorm(1,mu_x2[j],5) } X = cbind(1,x1,x2) #the Ys depend on x1 and x2 as well as a stochastic component mu = X%*%beta Y=c() for(k in 1:N){ Y[k] = rnorm(1,mu[k],sigma) } #now we think we're less naive so we control for x2 when estimating the causal effect of x1 ptb[i] = lm(Y~x1+x2)$coef[2] #but x1 is causally prior to x2 so this produces biased estimates #dropping x2 is a better idea no_ptb[i] = lm(Y~x1)$coef[2] } ## now plot the histograms par(mfrow=c(1,2)) hist(ptb,main="Post-treatment bias",xlim=c(1.3,4.5)) abline(v=4.2,col="blue") abline(v=mean(ptb),col="red") hist(no_ptb, main="No post-treatment bias",xlim=c(1.3,4.5)) abline(v=4.2,col="blue") abline(v=mean(no_ptb),col="red") savePlot(filename ="Figs/Confounding2.pdf", type="pdf") ## the estimates from the model with only x1 are right on average ## the model with x2 controlled for is badly biased dev.off() #### Contributors: Hagerdal and Masterson #### April 20, 2011 #### Checking Balance #### Matching: No page #. ####################################################################################### # We show how to check univariate and multivariate balance # ####################################################################################### rm(list=ls(all=TRUE)) ## Load your own data, but for this example ## we'll use Lalonde library(MatchIt) data(lalonde) ## Now we generalize the code data <- lalonde ## Get acquianted with the data dim(data) sum(data[,1]) summary(data[,1]) data[0:10,] data[200:210,] ## Subset the data into treated and untreated. ## ## Here, be aware that "treat" is the name ## of the treatment variable in Lalonde. ## You will probably need to change the name below ## to the name of your data's treatment variable data0 <- data[data$treat==0,] data1 <- data[data$treat==1,] dim(data0) dim(data1) ## First, let's look at a traditional measure of imbalance: ## A simple difference in means across covariates balance1 <- mean(data1)-mean(data0) ; balance1 sdbalance1 <- sd(data1)-mean(data0) ; sdbalance1 ## Now let's use the 'cem' package ## to utilize a better approach to balance: ## Difference in multivariate histograms library(cem) ## Be sure to use the drop command to drop your ## treatment variable and dependent variables ## from the balance measure. imbalance1 <- imbalance(group=data$treat, data=data, drop=c("treat","re78")) imbalance1 #### Contributors: Mark Bell and Nick Miller #### April 20, 2011 #### Propensity Score Matching #### Matching: Page 15-17 ###################################################################### # This code demonstrates how to do propensity # # score matching on an dataset examining a # # job training program (NSW). # # # # The variables in the data are as follows: # # nsw - 1 for participants, 0 otherwise # # age - age in years # # educ - years of education # # black - 1 if african American, 0 otherwise # # hispanic - 1 if Hispanic, 0 otherwise # # married - 1 if married, 0 otherwise # # re74 - earnings in 1974 # # re75 - earnings in 1975 # # re78 - earnings in 1978 # # u74 - 1 if unemployed in 1974, 0 otherwise # # u75 - 1 if unemployed in 1975, 0 otherwise # # u78 - 1 if unemployed in 1978, 0 otherwise # # # # nsw is the treatment; u78 and re78 are outcome variables # ###################################################################### ## Necessary libraries rm(list=ls(all=TRUE)) library(rgenoud) library(Matching) library(cem) library(MatchIt) ## Load data data(lalonde) data <- lalonde ## Although propensity score matching can easily be ## done in a package such as MatchIt, this code goes ## through it step by step to show how it works. ## 1. Check the multivariate balance on the dataset. ## Drop the outcome variables and treatment indicator todrop<-c("treat","re78","u78") imbalance(group=data$treat, data=data,drop=todrop) ## 2. Run a logit regression of the treatment on explanatory ## variables logit<-glm(treat~age+educ+black+hisp+married+re74+re75, family=binomial(link="logit"), data=data) summary(logit) ## 3. Generate fitted values. These estimate the probability ## of assignment to treatment for each unit based on their ## covariates PS.est<-logit$fitted.values ## Attach the fitted values to the data data2<-cbind(data,PS.est) ## As the graphs below show, the treatment and control groups ## have very different propensity to be assigned the treatment par(mfrow = c(1,2)) hist(PS.est[data2$treat==1], xlab="Propensity Score", main="Treatment") hist(PS.est[data2$treat==0], xlab="Propensity Score", main="Control") dev.off() ## 4. Match the data based on the propensity score. The M ## argument is the number of control units assigned to each ## treated unit. Weight=2 specifies the mahalanobis distance ## metric. Other options available—see help file. m<-Match(Y=data2$re78,Tr=data2$treat,X=data2$PS.est, estimand="ATT", BiasAdjust=TRUE, Weight=2, M=1) summary(m) ## 5. Propensity score matching can also be done directly in ## matchit m.out<-matchit(treat~age+educ+black+hisp+married+re74+re75, data=data, distance="logit") ## 6. Check multivariate imbalance on matched data todrop<-c("treat","re78", "distance", "weights") imbalance(group=match.data(m.out)$treat, data=match.data(m.out),drop=todrop) ## 7. Repeat this procedure with the aim of increasing the ## multivariate balance between treatment and control (the L1 ## statistic provided by the imbalance function provides a measure ## of this). This should be done by altering the specification ## of the logit function by (for example) including higher order ## terms or interactions. #### Contributors: Erin Baggott and Volha Charnysh #### April 20, 2011 #### Mahalanobis Distance Matching and CEM Matching #### Matching: Page 15-17 ####################################################################################### # We show how to perform Mahalanobis Distance Matching and CEM Matching # # # # This data is a subset of Bueno de Mesquita, B. and A. Smith (2010), “The # # Pernicious Consequences of UN Security Council Membership,” # # Journal of Conflict Resolution 54: 667. # # # # A brief description of the variables in the dataset: # # unmem - 0 if the country is not one of the 5 permament members of UNSC; 1 otherwise # # demaut - Polity: (democracy score - autocracy score +10)/20 # # lGDPpcWB-WDI: per capita GDP in const 2000 dollars # # tau_lead- Tau (measure of alliance portfolio) b/w CCode and system leader. # # lpopWB - log of population, World Bank data # # year4SC- whether the country has had a seat on the UNSC during the years in question# # (1 for true) # # delta4GDPpcWB change in GDP per capita # ####################################################################################### rm(list=ls(all=TRUE)) library(Zelig) library(cem) library(MatchIt) load("Gov2001CodeLibrary.RData") data <- BDMSmith head(data) ## Omitting missing values from the dataset: data<-na.omit(data) ## Examining balance on the covariates in the original dataset (multivariate and univariate): pre.imbalance <- imbalance(group=data$year4SC, data=data, drop=c("year4SC","delta4GDPpcWB")) pre.imbalance ## Our Multivariate Imbalance Measure: L1=0.799 ## Percentage of local common support: LCS=13.0% ## As we see, the balance is not great. ## Confirming the difference in means and quantiles for age distribution tr <- data$year4SC == 1 co <- data$year4SC == 0 mean(data$delta4GDPpcWB[tr]) - mean(data$delta4GDPpcWB[co]) quantile(data$delta4GDPpcWB[data$year4SC == 1], c(0,.25,.50,.75,1)) - quantile(data$delta4GDPpcWB[data$year4SC==0], c(0,.25,.50,.75,1)) ## Checking the graphs to understand why L1 measure is so low par(mfrow=c(1,2)) plot(density(data$delta4GDPpcWB[tr])) plot(density(data$delta4GDPpcW[co])) ## Using mahalanobis matching ## We use the Mahalanobis distance metric to measure the dissimilarity between two vectors ## - cases from the dataset with the specified covariates. mahal<- matchit(formula = year4SC ~ demaut+ lpopWB+lGDPpcWB+tau_lead , data = data, method = "nearest", distance = "mahalanobis") data_mah<-match.data(mahal) post.imbalance <- imbalance(group=data_mah$year4SC, data=data_mah, drop=c("year4SC","delta4GDPpcWB","distance","weights")) post.imbalance$L1 ## Multivariate Imbalance Measure: L1=0.275 ## Percentage of local common support: LCS=61.2% ## Now let's do Mahalanobis with replacement (replace=TRUE) and with 4 matches rather than 1 (r=4): mahal2 <- matchit(year4SC ~ demaut+lpopWB+lGDPpcWB, method = "nearest", r=4, data = data, replace=TRUE, distance="mahalanobis") summary(mahal2) data_mah2<-match.data(mahal2) post.imbalance2 <- imbalance(group=data_mah2$year4SC, data=data_mah2, drop=c("year4SC","delta4GDPpcWB","distance","weights")) post.imbalance2$L1 ## Multivariate Imbalance Measure: L1=0.346 ## Percentage of local common support: LCS=58.1% ## Estimating average effects: z.out0 <- zelig(delta4GDPpcWB ~ demaut+lpopWB+lGDPpcWB, data = match.data(mahal2, "control"), model = "ls") x.out0 <- setx(z.out0, data = match.data(mahal2, "treat"), cond = TRUE) s.out0 <- sim(z.out0, x = x.out0) summary(s.out0) ## 3 Now let's check out caliper (the number of standard deviations of the ## distance measure within which to draw control units (default = 0, no ## caliper matching). If a caliper is specified, a control unit within the ## caliper for a treated unit is randomly selected as the match for that ## treated unit. If caliper != 0, there are two additional options: ##– calclosest: whether to take the nearest available match if no matches are ## available within the caliper (default = FALSE). ##– mahvars: variables on which to perform Mahalanobis-metric matching within ## each caliper (default = NULL). Variables should be entered as a vector of ## variable names (e.g., mahvars = c("X1", "X2")). If mahvars is specified ## without caliper, the caliper is set to 0.25. mahal3 <- matchit(year4SC ~ demaut+lpopWB+lGDPpcWB, method = "nearest", caliper=3, mahvars = c("demaut", "lGDPpcWB", "lpopWB"), data = data, distance="mahalanobis") summary(mahal3) data_mah3<-match.data(mahal3) post.imbalance3 <- imbalance(group=data_mah3$year4SC, data=data_mah3, drop=c("year4SC","delta4GDPpcWB","distance","weights")) post.imbalance3$L1 ## Our results: Multivariate Imbalance Measure: L1=0.339 ## Percentage of local common support: LCS=56.8% ## Matching using CEM: ## Checking for balance for the effect over four years, ## Average Treatment Effect on the Treated for delta4GDPpcWB using cem: ## Autocoarsening: auto.match <- matchit(formula = year4SC ~ demaut+ lpopWB+lGDPpcWB+tau_lead, data = data, method = "cem") auto.match auto.data <- match.data(auto.match) auto.imb <- imbalance(group=auto.data$year4SC, data=auto.data, drop= c("year4SC","delta4GDPpcWB","distance","weights","subclass")) auto.imb$L1 ## Now we get Multivariate Imbalance Measure: L1=0.574; ## Percentage of local common support: LCS=39.4% lpopWBcut <- hist(data$lpopWB, plot=FALSE)$breaks lGDPpcWBcut <- hist(data$lGDPpcWB, plot=FALSE)$breaks my.cutpoints <- list(lpopWB= lpopWBcut, lGDPpcWB=lGDPpcWBcut) user.match <- matchit(formula = year4SC ~ demaut+ lpopWB+lGDPpcWB, data = data, method = "cem", cutpoints = my.cutpoints) user.data <- match.data(user.match) ser.imb <- imbalance(group=user.data$year4SC, data=user.data, drop=c("year4SC","delta4GDPpcWB","distance","weights", "subclass")) ser.imb$L1 ## Our results are much better: ## Multivariate Imbalance Measure: L1=0.531 ## Percentage of local common support: LCS=48.2% ## Calculating ATT: cem.match <- cem(treatment = "year4SC", data = data, drop = "delta4GDPpcWB") ## Simple (weighted) difference in means; e.g. linear regression cem.match.att <- att(obj=cem.match, formula= delta4GDPpcWB ~ year4SC, data = data, model="linear") cem.match.att #default is linear model ## Our results: ## SATT point estimate: -0.635883 (p.value=0.616556) ## 95% conf. interval: [-3.123998, 1.852232] ## Q-Q (Quantile-Quantile) plots: summary(auto.match) ## The first bloc of the summary output ("summary of balance for all data") ## reports the QQ summary statistics in the final three columns. These are ## the median, mean, and max distances between the treated and control groups. ## According to the matchit documentation, "Values greater than 0 indicate ## deviations between the groups in some part of the empirical distributions." plot(auto.match) ## Running this command shows you the QQ plots. #### Contributors: Nick Miller and Mark Bell #### April 20, 2011 #### How to Do Genetic Matching #### Matching: No page # ################################################################################################ # Download and install GenMatch package. Info available at http://sekhon.berkeley.edu/matching/# # The source for most of the info below is Jasjeet Sekhon, "Multivariate and Propensity Score # # Matching Software with Automated Balance Optimization: The Matching Package for R," Journal # # of Statistical Software. Available at the web address above # ################################################################################################ rm(list=ls(all=TRUE)) ## Load data library(MatchIt) data(lalonde) data <- lalonde ## GenMatch takes as arguments a vector of the treatment variable, and matrix of Xs ## that are to be balanced. The user can either specify a matrix of individual X ## covariates, and let GenMatch do the work, or estimate a propensity score and include ## that in the balance matrix ## Creating data inputs X <-cbind(data$age, data$educ, data$black, data$hisp, data$married, data$re74, data$re75) vars<-c("age", "educ", "black","hisp", "married", "re74", "re75") colnames(X)<-vars T<-cbind(data$treat) Y<-cbind(data$re78) ## loading required libraries library(Matching) library(rgenoud) ## In addition to specifying the treatment indicator and the balance matrix (in this case, X), ## there are several other important arguments the user must specify for the GenMatch function. ## The estimand is by default the ATT (average treatment effect for the treated), but can be ## changed to ATE or ATC. The number of matches is 1 by default, but can be altered using the ## M=_ argument. The pop.size argument is particularly important, as this dictates how long the ## program takes to run and also may affect the quality of the solution. As Sekhon notes, ## "The theorems proving that genetic algorithms find good solutions are asymptotic in population ## size. Therefore, it is important that this value not be small" (p. 16). 1000 may be a good ## benchmark, although this may take a while to run. Genetic Matching uses a evolutionary algorithm ## to optimize balance, and the pop.size argument determines the complexity of the algorithm gm<- GenMatch(Tr=T, X=X, estimand="ATT", M=1, pop.size=1000) ## Other options in GenMatch that a user may want to alter are described below. ## Consult the help file for more info: ## fit.func: determines the balance metric GenMatch optimizes. ## Default metrics are K-S tests and paired t-test p-values, ## but other options involving QQ plots are available ## exact: specifies which variables on which to match exactly ## caliper: the number of standard deviations within which matches are selected ## replace: whether matching is with or without replacement ## CommonSupport: whether observations outside common support are to be dropped ## nboots: number of bootstrap samples used to calculate KS values ## After finding the matching solution using GenMatch, the Match function is used to ## calculate the ATT estimate. The user must specify a vector of Y and weight matrix ## from the matching algorithm matchedgm<- Match(Y = Y, Tr = T, X = X, Weight.matrix = gm) ## Checking balance is done with the MatchBalance function. The user must specify the ## treatment and covariates, as well as the matched dataset with match.out argument. ## Running 1000 bootstrap samples to calculate KS tests MatchBalance(T ~ age + educ + black + hisp + married + re74 +re75, data = X, match.out = matchedgm, nboots = 1000) ## summary can be used to view the ATT estimate after matching summary(matchedgm)