#### "Model Dependence and Matching"
#### Harvard Gov2001 Code Library
#### Instructor: Gary King
#### Maintainers: Margaret Roberts and Brandon Stewart
#### May 1, 2011
#### Version 0.5
#### Contact: bstewart@fas.harvard.edu
# Summary: This file covers model dependence and ways to fix it.
#
# Table of Contents:
# Intro to Model Dependence
# Convex Hull and Model Fit
# Confounding variable bias and post-treatment bias
# Checking Balance
# Propensity Score Matching
# Mahalanobis Distance Matching and CEM Matching
# How to Do Genetic Matching
#### Contributors: Sorapop Kiatpongsan and Slawa Rokicki
#### April 20, 2011
#### Model Dependence Intro
#### Model Dependence: pg. 5
##########################################################################################
# We replicate some basic model dependence code from the slides #
##########################################################################################
## Clear the workspace (file convention: each section is self-contained)
rm(list=ls(all=TRUE))
## Set seed for reproducibility
set.seed(990)
## Make up some data: y = -0.3 + x^2 + noise
## (vectorized; no element-by-element loop is needed)
x <- runif(40, min = 1, max = 2)
rand <- rnorm(40, mean = 0, sd = 1)
y <- -0.3 + x^2 + rand
## Fit a linear model and compute 95% confidence intervals for its
## predictions over a grid of x values
linear.model <- lm(y ~ x)
newx <- seq(0, 6, .1)
CI <- predict(linear.model, data.frame(x = newx), level = 0.95,
              interval = "confidence")
## Fit a quadratic model and compute its fitted curve on the same grid
quad.model <- lm(y ~ x + I(x^2))
summary(quad.model)
quad.x <- seq(0, 6, .1)
## Use coef() rather than $coeff, which relies on partial matching
quad.y <- coef(quad.model)[1] + coef(quad.model)[2] * quad.x +
  coef(quad.model)[3] * quad.x^2
## Open the pdf device directly: savePlot(type = "pdf") is only
## available on some platforms, while pdf()/dev.off() works everywhere
## and matches the convention used elsewhere in this file
pdf(file = "Figs/ModelDependence.pdf")
## Plot the points and specify y-axis ticks
plot(x, y, xlim = c(0, 6), ylim = c(-10, 45), xlab = "X", ylab = "Y",
     main = "Plot of Model Dependence")
axis(2, at = seq(-10, 45, 5), labels = TRUE)
## Add the linear and quadratic fits to the plot
abline(linear.model)
lines(quad.x, quad.y, lty = 2)
## Add confidence intervals around the linear fit
lines(newx, CI[, 2], lty = 3)
lines(newx, CI[, 3], lty = 3)
## Label the fits
text(4.2, 0, "Solid: linear")
text(4.2, -2.3, "(Dotted: 95% CI)")
text(4, 30, "Dashed: quadratic")
dev.off()
#### Contributors: Allen Schmaltz and Robert Schub
#### April 20, 2011
#### Convex Hull and Model Fit
#### Model Dependence: pg. 18-20
#################################################################################
# This example makes use of the following dataset, which was provided #
# as part of Assignment 5 for Government 2001 (March 2011): #
# #
# "Ethnicity, Insurgency and Civil War," by James Fearon and David #
# Laitin (American Political Science Review. 97 (1) February 2003.) #
# #
# This code is intended to accompany slides 18-20 of Gary King's #
# lecture slides titled "Model Dependence in Counterfactual #
# Inference" (2010 version). What follows is for illustrative #
# purposes. #
# #
# Using the above dataset, we first test whether certain #
# counterfactuals are inside or outside of the convex hull. Unlike the #
# Doyle and Sambanis example provided in the lecture slides, in this #
# case there are a non-zero percentage of counterfactuals in the #
# convex hull, suggesting that the model dependence here is #
# potentially less severe than in the Doyle and Sambanis example. #
# #
# For illustrative purposes, we modify the model with one interaction #
# term, analogous to the example in the lecture slides. The graphs #
# suggest that the probabilities generated by the original and #
# modified logit models using the in-sample data seem reasonably #
# comparable. There is, however, rather more divergence when the #
# counterfactual data is used, as illustrated in the graph. #
# #
# The code is a straightforward consequence of calls to the zelig() and #
# whatif() functions. Inline comments below provide additional clarity. #
#################################################################################
## Clear the workspace (file convention: each section is self-contained)
rm(list = ls(all = TRUE))
library(Zelig)
library(WhatIf)
## Use the Fearon and Laitin data; "nwstate" (new state) is the
## treatment condition
load("Gov2001CodeLibrary.RData")
dat <- fearon
## Keep only the variables from Table 1, Model 1
vars <- c("onset", "nwstate", "warl", "gdpenl", "lpopl1", "lmtnest",
          "ncontig", "Oil", "instab", "polity2l", "ethfrac", "relfrac")
dat <- dat[, vars]
## Recode onset of 4 as 1 and drop rows with missing values
dat$onset[dat$onset == 4] <- 1
dat <- na.omit(dat)
dim(dat)
## Convex hull checks: flip "nwstate" from 0 to 1 and from 1 to 0.
## Split the data by treatment status and share one covariate formula.
treated <- dat[dat$nwstate == 1, ]
control <- dat[dat$nwstate == 0, ]
hull.formula <- ~ warl + gdpenl + lpopl1 + lmtnest + ncontig + Oil +
  instab + polity2l + ethfrac + relfrac
## Hull 1: counterfactually switch treated units (1) to control (0)
hull.treated <- whatif(hull.formula, data = control, cfact = treated)
summary(hull.treated)
## 47 out of 162 have a counterfactual in the hull
nrow(treated)
## Hull 2: counterfactually switch control units (0) to treated (1)
hull.control <- whatif(hull.formula, data = treated, cfact = control)
summary(hull.control)
## 173 out of 6165 have a counterfactual in the hull
## Create a modified model and compare in-sample vs. counterfactual fit

## Original model (Table 1, Model 1 specification)
orig<-zelig(onset~nwstate+warl+gdpenl+lpopl1+lmtnest+ncontig+Oil+
instab+polity2l+ethfrac+relfrac,data=dat, model="logit")
## Modified model: add an interaction between "nwstate" and "Oil"
mod<-zelig(onset~nwstate+warl+gdpenl+lpopl1+lmtnest+ncontig+Oil+
instab+polity2l+ethfrac+relfrac+nwstate*Oil,data=dat, model="logit")

## In-sample fit chart: fitted probabilities from the two models
## plotted against each other
orig.fit <- orig$fitted.values
mod.fit <- mod$fitted.values
pdf(file = "Figs/inSampleFitgraph.pdf")
plot(x = mod.fit, y = orig.fit, xlab = "Probabilities from Modified Model",
     ylab = "Probabilities from Original Model", main = "In Sample Fit",
     xlim = c(0, 1), ylim = c(0, 1))
abline(a = 0, b = 1, lty = 4)
dev.off()

## Counterfactual fit chart
## The coefficients estimated above are applied to counterfactual data
## matrices (treatment indicator flipped) to calculate fitted values:
## datCF feeds the original model's coefficients;
## datCFmod feeds the modified model's coefficients.
datCF <- dat
datCFmod <- dat
## Flip the treatment indicator for every row. After na.omit() nwstate
## is 0/1, so 1 - nwstate gives the counterfactual assignment; this
## replaces the original row-by-row loop.
datCF$nwstate <- 1 - datCF$nwstate
datCFmod$nwstate <- 1 - datCFmod$nwstate
## Recompute the interaction term as the 13th column so it lines up
## with the nwstate:Oil coefficient (last) in the modified model
datCFmod[, 13] <- datCFmod$nwstate * datCFmod$Oil

## Drop Y by overwriting the first column with the intercept, then
## coerce the data frame to a matrix for the linear predictor
datCFnew <- datCF
datCFnew[, 1] <- 1
datCFnew <- as.matrix(datCFnew)
dim(datCFnew)
head(datCFnew)
as.matrix(orig$coefficients)
dim(as.matrix(orig$coefficients))
## plogis() is the inverse logit: 1 / (1 + exp(-X %*% beta))
origCF.fit <- plogis(datCFnew %*% as.matrix(orig$coefficients))

## Same construction for the modified model
datCFmodNew <- datCFmod
datCFmodNew[, 1] <- 1
datCFmodNew <- as.matrix(datCFmodNew)
dim(datCFmodNew)
head(datCFmodNew)
modCF.fit <- plogis(datCFmodNew %*% as.matrix(mod$coefficients))

pdf(file = "Figs/counterfactualgraph.pdf")
plot(x = modCF.fit, y = origCF.fit, xlab = "Probabilities from Modified Model",
     ylab = "Probabilities from Original Model", main = "Counterfactual Prediction",
     xlim = c(0, 1), ylim = c(0, 1))
abline(a = 0, b = 1, lty = 4)
dev.off()
#### Contributors: James Conran, Jeremy Ferwerda & Yue Hou
#### April 20, 2011
#### Use simulated data to illustrate confounding variable bias and post-treatment bias
#### Model Dependence: pg. 21
#######################################################################################
# We will look at a data generating process with stochastic component Y~N(mu,sigma) #
# and systematic component mu = X*beta #
# X will be a matrix of independent variables, beta is a corresponding vector of #
# coefficients #
# #
# x1 will be the confounding variable - it will (by construction) be correlated  #
# with both Y and x2; leaving it out of the model will therefore give biased     #
# estimates of x2's causal effect                                                #
#######################################################################################
## Clear the workspace (file convention: each section is self-contained)
rm(list = ls(all = TRUE))
## Number of observations
N <- 1000
## Draw x1 from a normal distribution with arbitrarily chosen mean and
## standard deviation
x1 <- rnorm(N, 11, 3)
## Each observed value of x2 is drawn from a normal whose mean depends
## on x1, with arbitrarily set parameters (slope 0.9, sd 5).
## rnorm() is vectorized over its mean argument, so no loop is needed.
mu_x2 <- x1 * 0.9
x2 <- rnorm(N, mu_x2, 5)
## R^2 in a bivariate regression of the two variables is 0.2 or
## thereabouts - this will vary on each run of this code
summary(lm(x2 ~ x1))
## Data matrix for the true data generating process (the column of
## ones is for the intercept)
X <- cbind(1, x1, x2)
## Set the true parameter values
b0 <- 10
b1 <- 1.5
b2 <- 3
beta <- c(b0, b1, b2)
## Each observation of Y has its own expected value, depending on its
## values of x1 & x2: mu_i = X_i * beta
mu <- X %*% beta
## Each of the N observed outcomes is drawn from a normal with mean
## mu_i and standard deviation sigma; this data generating process
## happens to be homoskedastic, so sigma is a scalar.
## rnorm() is again vectorized over the mean, replacing the loop.
sigma <- 6
Y <- rnorm(N, mu, sigma)
## Not knowing the data generating process, we naively estimate the
## causal effect of x2 without any control variables
summary(lm(Y ~ x2))
## We conclude that a one unit increase in x2 causes an increase in Y
## of about 3.4 (again, the exact number will differ on each run),
## but this estimate suffers from omitted variable bias.
## We can eliminate the bias by including x1 (the confounding
## variable) in the model:
summary(lm(Y ~ x1 + x2))
## Now the estimated causal effect of x2 is close to the true parameter of 3
## Show that the unbiasedness wasn't just luck by simulating the whole
## process 1000 times
n.sims <- 1000
## Preallocate the result vectors rather than growing them in the loop
biased <- numeric(n.sims)
unbiased <- numeric(n.sims)
for (i in seq_len(n.sims)) {
  # draw x1
  x1 <- rnorm(N, 11, 3)
  # the x2s depend on the x1s as well as a stochastic component
  # (rnorm() is vectorized over its mean, so no inner loop is needed)
  mu_x2 <- x1 * 0.9
  x2 <- rnorm(N, mu_x2, 5)
  X <- cbind(1, x1, x2)
  # the Ys depend on x1 and x2 as well as a stochastic component
  mu <- X %*% beta
  Y <- rnorm(N, mu, sigma)
  # estimate x2's causal effect naively and with the confounder
  # controlled for, in each simulated sample
  biased[i] <- lm(Y ~ x2)$coef[2]
  unbiased[i] <- lm(Y ~ x1 + x2)$coef[3]
}
## Histograms with a blue line at the true parameter value and a red
## line at the average estimate. Open the pdf device directly rather
## than using savePlot(), which is not available on all platforms.
pdf(file = "Figs/Confounding.pdf")
par(mfrow = c(1, 2))
hist(biased, xlim = c(2.9, 3.5))
abline(v = 3, col = "blue")
abline(v = mean(biased), col = "red")
hist(unbiased, xlim = c(2.9, 3.5))
abline(v = 3, col = "blue")
abline(v = mean(unbiased), col = "red")
dev.off()
## The estimate for x1 is also close to the true parameter of 1.5,
## but it cannot be interpreted causally due to post-treatment bias:
## x1 is causally prior to x2 (it is part of the systematic component
## of the process generating x2), so by controlling for x2 we control
## for a consequence of the cause. When estimating the causal effect
## of x1 on Y we should therefore NOT include x2 in the model:
summary(lm(Y ~ x1))
## This gives an estimated coefficient of about 4.2. We know (because
## we decided) that the true total causal effect of a one unit change
## in x1 on Y is 1.5 + 0.9*3 = 4.2: the direct effect of x1 (b1 = 1.5)
## plus its effect through x2 (b2 = 3 scaled by the 0.9 slope in the
## systematic component of x2).
## Again we can simulate this in the same way:
## Simulate the post-treatment-bias comparison 1000 times
n.sims <- 1000
## Preallocate the result vectors rather than growing them in the loop
ptb <- numeric(n.sims)
no_ptb <- numeric(n.sims)
for (i in seq_len(n.sims)) {
  # draw x1
  x1 <- rnorm(N, 11, 3)
  # the x2s depend on the x1s as well as a stochastic component
  # (rnorm() is vectorized over its mean, so no inner loop is needed)
  mu_x2 <- x1 * 0.9
  x2 <- rnorm(N, mu_x2, 5)
  X <- cbind(1, x1, x2)
  # the Ys depend on x1 and x2 as well as a stochastic component
  mu <- X %*% beta
  Y <- rnorm(N, mu, sigma)
  # controlling for x2 induces post-treatment bias, because x1 is
  # causally prior to x2
  ptb[i] <- lm(Y ~ x1 + x2)$coef[2]
  # dropping x2 recovers the total causal effect of x1
  no_ptb[i] <- lm(Y ~ x1)$coef[2]
}
## Plot the histograms: blue line at the true total effect (4.2), red
## line at the average estimate. Open the pdf device directly rather
## than using savePlot(), which is not available on all platforms.
pdf(file = "Figs/Confounding2.pdf")
par(mfrow = c(1, 2))
hist(ptb, main = "Post-treatment bias", xlim = c(1.3, 4.5))
abline(v = 4.2, col = "blue")
abline(v = mean(ptb), col = "red")
hist(no_ptb, main = "No post-treatment bias", xlim = c(1.3, 4.5))
abline(v = 4.2, col = "blue")
abline(v = mean(no_ptb), col = "red")
dev.off()
## The estimates from the model with only x1 are right on average;
## the model with x2 controlled for is badly biased.
#### Contributors: Hagerdal and Masterson
#### April 20, 2011
#### Checking Balance
#### Matching: No page #.
#######################################################################################
# We show how to check univariate and multivariate balance #
#######################################################################################
## Clear the workspace (file convention: each section is self-contained)
rm(list = ls(all = TRUE))
## Load your own data; for this example we use the Lalonde data
library(MatchIt)
data(lalonde)
## Now we generalize the code
data <- lalonde
## Get acquainted with the data
dim(data)
sum(data[, 1])
summary(data[, 1])
data[1:10, ]
data[200:210, ]
## Subset the data into treated and untreated units.
##
## "treat" is the name of the treatment variable in Lalonde; you will
## probably need to change the name below to the name of your own
## data's treatment variable.
data0 <- data[data$treat == 0, ]
data1 <- data[data$treat == 1, ]
dim(data0)
dim(data1)
## First, a traditional measure of imbalance: a simple difference in
## means across covariates. mean() on a whole data frame is
## deprecated, so compute column-wise over the numeric columns.
num <- sapply(data, is.numeric)
balance1 <- colMeans(data1[, num]) - colMeans(data0[, num]); balance1
## Difference in standard deviations across covariates.
## (Fixed: the original subtracted the control-group *mean* from the
## treated-group sd, which was a bug.)
sdbalance1 <- sapply(data1[, num], sd) - sapply(data0[, num], sd); sdbalance1
## Now use the 'cem' package for a better approach to balance:
## the difference in multivariate histograms.
library(cem)
## Be sure to use the drop argument to exclude your treatment and
## dependent variables from the balance measure.
imbalance1 <- imbalance(group = data$treat, data = data, drop = c("treat", "re78"))
imbalance1
#### Contributors: Mark Bell and Nick Miller
#### April 20, 2011
#### Propensity Score Matching
#### Matching: Page 15-17
######################################################################
# This code demonstrates how to do propensity #
# score matching on a dataset examining a                            #
# job training program (NSW). #
# #
# The variables in the data are as follows: #
# nsw - 1 for participants, 0 otherwise #
# age - age in years #
# educ - years of education #
# black - 1 if african American, 0 otherwise #
# hispanic - 1 if Hispanic, 0 otherwise #
# married - 1 if married, 0 otherwise #
# re74 - earnings in 1974 #
# re75 - earnings in 1975 #
# re78 - earnings in 1978 #
# u74 - 1 if unemployed in 1974, 0 otherwise #
# u75 - 1 if unemployed in 1975, 0 otherwise #
# u78 - 1 if unemployed in 1978, 0 otherwise #
# #
# nsw is the treatment; u78 and re78 are outcome variables #
######################################################################
## Load the required libraries
rm(list = ls(all = TRUE))
library(rgenoud)
library(Matching)
library(cem)
library(MatchIt)

## Load the data
data(lalonde)
data <- lalonde

## Propensity score matching can easily be done in a package such as
## MatchIt, but to show how it works this code first walks through it
## step by step.

## 1. Check the multivariate balance on the raw dataset, dropping the
##    outcome variables and the treatment indicator.
todrop <- c("treat", "re78", "u78")
imbalance(group = data$treat, data = data, drop = todrop)

## 2. Run a logit regression of the treatment on the explanatory
##    variables.
##    NOTE(review): this assumes the loaded lalonde data has a column
##    named "hisp" - confirm against the package version in use.
ps.logit <- glm(treat ~ age + educ + black + hisp + married + re74 + re75,
                family = binomial(link = "logit"), data = data)
summary(ps.logit)

## 3. Generate the fitted values: each unit's estimated probability of
##    assignment to treatment given its covariates.
PS.est <- ps.logit$fitted.values
## Attach the fitted values to the data
data2 <- cbind(data, PS.est)

## As the histograms below show, the treatment and control groups have
## very different propensities to be assigned the treatment.
par(mfrow = c(1, 2))
hist(PS.est[data2$treat == 1],
     xlab = "Propensity Score", main = "Treatment")
hist(PS.est[data2$treat == 0],
     xlab = "Propensity Score", main = "Control")
dev.off()

## 4. Match on the propensity score. M is the number of control units
##    assigned to each treated unit; Weight = 2 selects the
##    Mahalanobis distance metric (other options are in the help file).
m <- Match(Y = data2$re78, Tr = data2$treat, X = data2$PS.est,
           estimand = "ATT", BiasAdjust = TRUE, Weight = 2, M = 1)
summary(m)

## 5. Propensity score matching can also be done directly in matchit.
m.out <- matchit(treat ~ age + educ + black + hisp + married + re74 + re75,
                 data = data, distance = "logit")

## 6. Check multivariate imbalance on the matched data.
todrop <- c("treat", "re78", "distance", "weights")
imbalance(group = match.data(m.out)$treat,
          data = match.data(m.out), drop = todrop)

## 7. Repeat this procedure with the aim of increasing the
##    multivariate balance between treatment and control (measured by
##    the L1 statistic from the imbalance function), e.g. by adding
##    higher-order terms or interactions to the logit specification.
#### Contributors: Erin Baggott and Volha Charnysh
#### April 20, 2011
#### Mahalanobis Distance Matching and CEM Matching
#### Matching: Page 15-17
#######################################################################################
# We show how to perform Mahalanobis Distance Matching and CEM Matching #
# #
# This data is a subset of Bueno de Mesquita, B. and A. Smith (2010), “The #
# Pernicious Consequences of UN Security Council Membership,” #
# Journal of Conflict Resolution 54: 667. #
# #
# A brief description of the variables in the dataset: #
# unmem - 0 if the country is not one of the 5 permanent members of UNSC; 1 otherwise #
# demaut - Polity: (democracy score - autocracy score +10)/20 #
# lGDPpcWB-WDI: per capita GDP in const 2000 dollars #
# tau_lead- Tau (measure of alliance portfolio) b/w CCode and system leader. #
# lpopWB - log of population, World Bank data #
# year4SC- whether the country has had a seat on the UNSC during the years in question#
# (1 for true) #
# delta4GDPpcWB change in GDP per capita #
#######################################################################################
## Clear the workspace (file convention: each section is self-contained)
rm(list = ls(all = TRUE))
library(Zelig)
library(cem)
library(MatchIt)
## Load the UN Security Council membership data
load("Gov2001CodeLibrary.RData")
data <- BDMSmith
head(data)
## Omit missing values from the dataset
data <- na.omit(data)
## Examine balance on the covariates in the original dataset
## (multivariate and univariate), dropping the treatment indicator and
## the outcome from the balance measure
pre.imbalance <- imbalance(group = data$year4SC, data = data,
                           drop = c("year4SC", "delta4GDPpcWB"))
pre.imbalance
## Our Multivariate Imbalance Measure: L1=0.799
## Percentage of local common support: LCS=13.0%
## As we see, the balance is not great.
## Confirm the difference in means and quantiles of the outcome
## (change in GDP per capita) between treated and control units
tr <- data$year4SC == 1
co <- data$year4SC == 0
mean(data$delta4GDPpcWB[tr]) - mean(data$delta4GDPpcWB[co])
quantile(data$delta4GDPpcWB[tr], c(0, .25, .50, .75, 1)) -
  quantile(data$delta4GDPpcWB[co], c(0, .25, .50, .75, 1))
## Check the density plots to see where the two groups diverge
par(mfrow = c(1, 2))
plot(density(data$delta4GDPpcWB[tr]))
## (Fixed: the original referenced data$delta4GDPpcW, which only
## worked through $'s partial name matching.)
plot(density(data$delta4GDPpcWB[co]))
## Using Mahalanobis matching
## We use the Mahalanobis distance metric to measure the dissimilarity between two vectors
## - cases from the dataset with the specified covariates.
## Nearest-neighbor matching on the Mahalanobis distance:
mahal<- matchit(formula = year4SC ~ demaut+ lpopWB+lGDPpcWB+tau_lead ,
data = data, method = "nearest", distance = "mahalanobis")
## Keep only the matched observations and re-check multivariate
## balance, dropping the treatment, the outcome, and matchit's
## bookkeeping columns ("distance", "weights")
data_mah<-match.data(mahal)
post.imbalance <- imbalance(group=data_mah$year4SC, data=data_mah,
drop=c("year4SC","delta4GDPpcWB","distance","weights"))
post.imbalance$L1
## Multivariate Imbalance Measure: L1=0.275
## Percentage of local common support: LCS=61.2%
## Now let's do Mahalanobis with replacement (replace=TRUE) and with 4 matches rather than 1 (r=4):
## NOTE(review): "r" is presumably intended as the number-of-matches
## (ratio) argument; confirm the installed matchit version actually
## honors an argument by this name rather than silently ignoring it.
mahal2 <- matchit(year4SC ~ demaut+lpopWB+lGDPpcWB, method = "nearest", r=4,
data = data, replace=TRUE, distance="mahalanobis")
summary(mahal2)
data_mah2<-match.data(mahal2)
post.imbalance2 <- imbalance(group=data_mah2$year4SC, data=data_mah2,
drop=c("year4SC","delta4GDPpcWB","distance","weights"))
post.imbalance2$L1
## Multivariate Imbalance Measure: L1=0.346
## Percentage of local common support: LCS=58.1%
## Estimating average effects:
## Fit the outcome model on the matched control units only...
z.out0 <- zelig(delta4GDPpcWB ~ demaut+lpopWB+lGDPpcWB,
data = match.data(mahal2, "control"), model = "ls")
## ...set the covariates to those of the matched treated units
## (cond = TRUE performs conditional prediction on the observed data)...
x.out0 <- setx(z.out0, data = match.data(mahal2, "treat"), cond = TRUE)
## ...and simulate to summarize the average effect for the treated
s.out0 <- sim(z.out0, x = x.out0)
summary(s.out0)
## 3 Now let's check out caliper (the number of standard deviations of the
## distance measure within which to draw control units (default = 0, no
## caliper matching). If a caliper is specified, a control unit within the
## caliper for a treated unit is randomly selected as the match for that
## treated unit. If caliper != 0, there are two additional options:
##- calclosest: whether to take the nearest available match if no matches are
## available within the caliper (default = FALSE).
##- mahvars: variables on which to perform Mahalanobis-metric matching within
## each caliper (default = NULL). Variables should be entered as a vector of
## variable names (e.g., mahvars = c("X1", "X2")). If mahvars is specified
## without caliper, the caliper is set to 0.25.
mahal3 <- matchit(year4SC ~ demaut+lpopWB+lGDPpcWB, method = "nearest",
caliper=3, mahvars = c("demaut", "lGDPpcWB", "lpopWB"),
data = data, distance="mahalanobis")
summary(mahal3)
data_mah3<-match.data(mahal3)
post.imbalance3 <- imbalance(group=data_mah3$year4SC, data=data_mah3,
drop=c("year4SC","delta4GDPpcWB","distance","weights"))
post.imbalance3$L1
## Our results: Multivariate Imbalance Measure: L1=0.339
## Percentage of local common support: LCS=56.8%
## Matching using CEM (coarsened exact matching)

## Autocoarsening: let matchit choose the coarsening automatically,
## then check balance for the four-year ATT outcome (delta4GDPpcWB)
auto.match <- matchit(formula = year4SC ~ demaut+ lpopWB+lGDPpcWB+tau_lead,
                      data = data, method = "cem")
auto.match
auto.data <- match.data(auto.match)
auto.imb <- imbalance(group = auto.data$year4SC, data = auto.data,
                      drop = c("year4SC", "delta4GDPpcWB", "distance",
                               "weights", "subclass"))
auto.imb$L1
## Now we get Multivariate Imbalance Measure: L1=0.574;
## Percentage of local common support: LCS=39.4%

## User-chosen coarsening: use the histogram breakpoints of the two
## continuous covariates as the cutpoints
lpopWBcut <- hist(data$lpopWB, plot = FALSE)$breaks
lGDPpcWBcut <- hist(data$lGDPpcWB, plot = FALSE)$breaks
my.cutpoints <- list(lpopWB = lpopWBcut, lGDPpcWB = lGDPpcWBcut)
user.match <- matchit(formula = year4SC ~ demaut+ lpopWB+lGDPpcWB, data = data,
                      method = "cem", cutpoints = my.cutpoints)
user.data <- match.data(user.match)
user.imb <- imbalance(group = user.data$year4SC, data = user.data,
                      drop = c("year4SC", "delta4GDPpcWB", "distance",
                               "weights", "subclass"))
user.imb$L1
## Our results are much better:
## Multivariate Imbalance Measure: L1=0.531
## Percentage of local common support: LCS=48.2%

## Calculating the ATT with the cem package directly:
cem.match <- cem(treatment = "year4SC", data = data, drop = "delta4GDPpcWB")
## Simple (weighted) difference in means, e.g. linear regression
cem.match.att <- att(obj = cem.match, formula = delta4GDPpcWB ~ year4SC,
                     data = data, model = "linear")
cem.match.att  # the default is a linear model
## Our results:
## SATT point estimate: -0.635883 (p.value=0.616556)
## 95% conf. interval: [-3.123998, 1.852232]

## Q-Q (quantile-quantile) plots:
summary(auto.match)
## The first block of the summary output ("summary of balance for all
## data") reports the QQ summary statistics in its final three columns:
## the median, mean, and max distances between the treated and control
## groups. According to the matchit documentation, "Values greater than
## 0 indicate deviations between the groups in some part of the
## empirical distributions."
plot(auto.match)
## Running this command shows you the QQ plots.
#### Contributors: Nick Miller and Mark Bell
#### April 20, 2011
#### How to Do Genetic Matching
#### Matching: No page #
################################################################################################
# Download and install GenMatch package. Info available at http://sekhon.berkeley.edu/matching/#
# The source for most of the info below is Jasjeet Sekhon, "Multivariate and Propensity Score #
# Matching Software with Automated Balance Optimization: The Matching Package for R," Journal #
# of Statistical Software. Available at the web address above #
################################################################################################
## Clear the workspace (file convention: each section is self-contained)
rm(list = ls(all = TRUE))
## Load the data
library(MatchIt)
data(lalonde)
data <- lalonde
## GenMatch takes as arguments a vector of the treatment variable and
## a matrix of Xs to be balanced. The user can either specify a matrix
## of individual X covariates and let GenMatch do the work, or
## estimate a propensity score and include that in the balance matrix.
## Creating the data inputs
X <- cbind(data$age, data$educ, data$black, data$hisp, data$married,
           data$re74, data$re75)
vars <- c("age", "educ", "black", "hisp", "married", "re74", "re75")
colnames(X) <- vars
## Use "Tr" rather than "T" for the treatment vector: T is R's
## built-in shorthand for TRUE and should never be shadowed by a
## variable name
Tr <- cbind(data$treat)
Y <- cbind(data$re78)
## Loading the required libraries
library(Matching)
library(rgenoud)
## In addition to the treatment indicator and the balance matrix (X),
## several other arguments matter for GenMatch. The estimand defaults
## to the ATT (average treatment effect for the treated) but can be
## changed to ATE or ATC. The number of matches defaults to 1 (the M
## argument). pop.size is particularly important: it dictates how long
## the program takes to run and may affect solution quality. As Sekhon
## notes, "The theorems proving that genetic algorithms find good
## solutions are asymptotic in population size. Therefore, it is
## important that this value not be small" (p. 16). 1000 is a good
## benchmark, although it may take a while to run. Genetic matching
## uses an evolutionary algorithm to optimize balance, and pop.size
## determines the complexity of that algorithm.
gm <- GenMatch(Tr = Tr, X = X, estimand = "ATT", M = 1, pop.size = 1000)
## Other GenMatch options a user may want to alter (see the help file):
## fit.func: the balance metric GenMatch optimizes (defaults are K-S
##   tests and paired t-test p-values; QQ-plot-based options exist)
## exact: which variables to match on exactly
## caliper: the number of standard deviations within which matches are selected
## replace: whether matching is with or without replacement
## CommonSupport: whether observations outside common support are dropped
## nboots: number of bootstrap samples used to calculate KS values
## After finding the matching solution with GenMatch, the Match
## function calculates the ATT estimate; supply the outcome vector Y
## and the weight matrix from the genetic search.
matchedgm <- Match(Y = Y, Tr = Tr, X = X, Weight.matrix = gm)
## Balance checking is done with MatchBalance: specify the treatment
## and covariates plus the matched object via match.out, running 1000
## bootstrap samples for the KS tests. Wrap X in as.data.frame()
## because MatchBalance's data argument expects a data frame.
MatchBalance(Tr ~ age + educ + black + hisp + married + re74 + re75,
             data = as.data.frame(X), match.out = matchedgm, nboots = 1000)
## summary() shows the ATT estimate after matching
summary(matchedgm)