#### "Single Equation Models"
#### Harvard Gov2001 Code Library
#### Instructor: Gary King
#### Maintainers: Margaret Roberts and Brandon Stewart
#### May 1, 2011
#### Version 0.5
#### Contact: bstewart@fas.harvard.edu
# Summary: This file covers single equation models.
#
# Table of Contents:
# Show how the logit graph varies with xbeta
# Generate data for probit via a latent variable
# Why we simulate from the multivariate normal
# Simulation of Predicted Values and Expected Values
# Example of Simulating First Differences
# Reparameterizing on an Unbounded Scale and First Differences
# Graph on Confidence Intervals
# Ternary Plots in R
# Grouped uncorrelated binary variables model
# Variance in Poisson Models
# Logit vs. Probit
# Exponential Models
# Weibull Models
# Cox Proportional Hazards Models
# Zero-Inflated Negative Binomial Regression
#### Contributors: Ariel White, Ben Schneer, Michael Gill
#### April 20, 2011
#### Show how the logit graph varies with xbeta
#### Single-Equation Models: pg. 2
##########################################################################################
# Attribution: Thanks to Andrew Gelman: #
# http://www.stat.columbia.edu/~gelman/bugs.course/logit/logit.R #
# #
# In this exercise, we'll look at how the graph of the logit curve varies with xbeta. #
# We'll generate different datasets, fit logit models to them, #
# and plot the corresponding curves. #
##########################################################################################
rm(list=ls(all=TRUE))
library(mvtnorm)
invlogit <- function(x){
1 / (1 + exp(-x))
}#setting up a function that will help us generate data
## so, let's generate some data with some parameters:
x <- runif(100, -6, 6)
y <- rbinom (100, 1, invlogit(x))
pdf(file="Figs/logitXB.pdf")#this starts making a pdf of the figure
plot(x,y, col="white", main="Logit Curves") #this just sets up the plot; we'll add curves later
## now let's fit a logit to that, and plot the curve.
logit.R <- glm (y ~ x, family=binomial(link="logit"))
curve (invlogit (logit.R$coef[1] + logit.R$coef[2]*x),add=TRUE)
## if we do this with different values of x, we get different shapes:
x1 <- runif(100, -5, 5)
y1 <- rbinom (100, 1, invlogit(x1))
logit.1 <- glm (y1 ~ x1, family=binomial(link="logit"))
curve (invlogit (logit.1$coef[1] + logit.1$coef[2]*x),add=TRUE)
x2 <- runif(100, 0, 10)
y2 <- rbinom (100, 1, invlogit(x2))
logit.2 <- glm (y2 ~ x2, family=binomial(link="logit")) #Note this throws an error to make a point.
curve (invlogit (logit.2$coef[1] + logit.2$coef[2]*x),add=TRUE)
dev.off() #end pdf
## we can change x around, and see how this changes the curve.
#### Contributors: Ariel White, Ben Schneer, Michael Gill
#### April 20, 2011
#### Generate data for probit via a latent variable
#### Single-Equation Models: pg. 9
#################################################################################################
# Show how to generate binary data that could be modeled as a probit using y* as a #
# continuous variable: #
# Like the logit model, the probit model is used with binary outcome variables. In this example,#
# our task is to simulate y* as a function of some covariate, then dichotomize it according to #
# that story, and show that probit recovers the answer. #
# #
# Imagine we had a dataset of 10000 random individuals with information about their height, #
# whether they are of high or low income, but their true income is onobserved (y*). To show the #
# latent variable recovery of the probit, consider the following: #
# We generate a y* using simulated data with some covariates beta and some X's: #
#################################################################################################
rm(list=ls(all=TRUE))
N <- 10000
set.seed(02138)
height <- runif(N, 5.5, 6.25) #Sample of 10000 peoples' heights
hist(height, xlab="Height (in feet)", main="Frequency of Heights of Individuals in Sample (N=10000)")
set.seed(02138)
epsilon <- rnorm(N, 0, 10000)
y.star <- 10000*height + epsilon #A person's income as a function of their height
hist(y.star)
highincome <- ifelse(y.star>60000,1,0)
## Here we set a cutoff point according to if their income is higher than 60000 dollars a year.
## Our merged dataset:
data <- as.data.frame(cbind(y.star, highincome, height))
head(data)
probit.reg <- glm(highincome ~ height, data=data, family=binomial(link="probit"))
summary(probit.reg)
## Note: We see the intercept returned is proportionate to the cut-off point we
## set for high-income earners;
## We also see that the coefficient on hight is proportionate to the true coefficient on height used to
## generate y*.
#### Contributors: Ariel White, Ben Schneer, Michael Gill
#### April 20, 2011
#### Why we simulate from the multivariate normal
#### Single-Equation Models: pg. 18
############################################################################
# Suppose we ran a bivariate regression of y on x1 and x2 (w/ no constant) #
# and recovered the estimates beta #
# with covariance matrix var, estimated variance sigma2 #
# and N = 200 #
# We want to predict the outcome y when x1=1 and x2=1 #
############################################################################
rm(list=ls(all=TRUE))
library(MASS)
beta<-c(5,-10)
var<-matrix(c(10,.5,.5,4),nrow=2,ncol=2)
sigma2<-10
N<-200
S<-sigma2*(N-3)
## Part A. Simulating y the proper way
## Draws from multivariate normal
draws<-mvrnorm(1000,beta,sqrt(var))
sdraws<-sqrt(S/rchisq(1000, N-3, ncp=0))
X<-c(1,1)
sims<-draws%*%X
y<-sapply(sims,rnorm,n=1,sd=sdraws)
## Part B: Not using Multivariate normal
draws<-cbind(rnorm(1000,beta[1],var[1,1]),rnorm(1000,beta[2],var[2,2]))
sims<-draws%*%X
y2<-sapply(sims,rnorm,n=1,sd=sdraws)
## Comparison of predicted distribution of y
## Notice the inflated variance of the draws from the wrong distribution
## because of the omission of the covariances
pdf("Figs/SimulateMVNvsNorm.pdf",width=11,height=8.5)
plot(density(y),col="red",xlim=c(-30,30),main="Draws from MV Normal vs. Normal")
lines(density(y2),col="dodgerblue")
legend("topright",legend=c("MVN","N"),lty=c(1,1),col=c("red","dodgerblue"))
dev.off()
#### Contributors: Aditya Dasgupta and Sean Ingham
#### April 20, 2011
#### Simulation of Predicted Values and Expected Values
#### Single-Equation Models: pg. 20
############################################################################
# This example will use data from Besley, Persson and #
# Sturm (2010), "Political Competition, Policy and Growth", which #
# examines the effect of political competition on economic #
# policies in US states over time. #
############################################################################
rm(list=ls(all=TRUE))
## We will also need to simulate random draws from a multivariate
## normal distribution
library(mvtnorm)
## Load Data
load("Gov2001CodeLibrary.RData")
data <- BesleyEtAl
## Let's replicate column 1 from table 2 of the article. This is an
## OLS regression of state tax revenue as a share of state income
## on political competition, with 'state and year fixed effects' (dummy
## variables for each state and for each year, save a year and a state
## reference category). Note that for the purposes of this example
## we will use the ordinary OLS standard errors, as opposed to the SEs
## adjusted for clustering, as in the article.
lm1 <- lm(share_taxes_inc ~ compnorm + as.factor(state) + as.factor(year),
data = data)
lm1$coefficients[2]; sqrt(vcov(lm1)[2,2])
## Note: the coefficient is the same as in the article, but the
## standard error is different because we have not adjusted for clustering!
## This is to keep the example simple. But keep in mind that the decision
## whether to to adjust standard errors or not impacts the predicted values
## and expected values, since standard error adjustments affect the
## variance-covariance matrix, and thus also our estimation uncertainty.
## Simulate Predicted Values
## Step 1) Since we have a large sample size, we simulate the uncertainty
## about our coefficient estimates by randomly drawing from a multivariate normal
## distribution with a mean equal to a vector of the coefficients estimates and
## variance/covariance matrix equal to the variance-covariance matrix from our
## fitted model.
set.seed(02138) ## Setting seed to ensure replicability
simbetas <- rmvnorm(1000, mean=lm1$coefficients, sigma=vcov(lm1))
help(rmvnorm)
## Step 2) We pick the explanatory variable values for which we want to generate
## predicted values of the outcome variable. Let's say we are interested in the
## predicted state tax share in Arizona in 1970, if political competition were
## quite low: -.3 (the domain of possible values is [-.5, 0]). Taking into account
## the intercept, we want to create a vector of explanatory variable values where
## the Arizona state dummy variable is 1, the 1970 dummy variable is 1, the
## political competition variable is -.3, and all other variables are 0.
predCovs <- c( 1, -.3, 1, c(rep(0,times=length(4:68))), 1,
c(rep(0,times=length(70:100))) ) ## (intercept,compnorm,arizona,....,1970,...)
## Step 3) For each vector of randomly drawn parameters, we generate a predicted
## value by multiplying it with the vector of explanatory variable values. Note
## that we also incorporate fundamental uncertainty for each predicted value
## by drawing from the stochastic component of our estimator - in the case of
## OLS, a normal distribution.
set.seed(02138)
sim <- apply( simbetas, 1, function(x) {
rnorm( 1, mean=predCovs%*%x, sd=sd(residuals(lm1)) )} ) # We apply the rnorm(...
# function by row, where
# each row corresponds
# to a vector of
# coefficient estimates.
## Step 4) Finally, we plot the thousand predicted values we have simulated.
plot(density(sim), main="Predicted Value of Tax Share",
xlab="Tax Share",col="blue")
abline(v=mean(sim),col="red")
savePlot(filename ="Figs/BesleyPredicted.pdf", type="pdf")
## Simulate Expected Values
## To simulate expected values, we simply average over the fundamental uncertainty
## associated with predicted values to yield one expected value for m simulations
## of predicted values. We do this repeatedly in order to simulate the distribution
## of expected values.
set.seed(02138)
expectedsim <- apply( simbetas, 1, function(x) {
mean(rnorm( 1000, mean=predCovs%*%x, sd=sd(residuals(lm1))) )} )
## This is the resulting simulated distribution:
plot(density(expectedsim), main="Expected Value of Tax Share",
xlab="Tax Share",col="green")
abline(v=mean(expectedsim),col="yellow")
savePlot(filename ="Figs/BesleyExpected.pdf", type="pdf")
## Comparing Predicted and Expected Values
## Compare the simulated distributions of predicted and expected values:
plot(density(sim), main="Predicted and Expected Value of Tax Share",
xlab="Tax Share",col="blue",ylim=c(0,3))
abline(v=mean(sim),col="red")
points(density(expectedsim), type='l',col="green")
abline(v=mean(expectedsim),col="yellow")
savePlot(filename ="Figs/BesleyCompare.pdf", type="pdf")
## Things to note:
## The predicted values are more widely dispersed, because, as we have seen,
## they incorporate both fundamental and estimation uncertainty, while
## expected values incorporate only estimation uncertainty and average over
## fundamental uncertainty.
## The mean of both distributions is in expectation usually the same.
## This is because both incorporate draws from the fixed/systematic
## component of Y.
## Which one matters depends on what question you are asking! If you
## are interested in population average properties, you would be more
## interested in the distribution of expected values. If you are
## interested in causal effects, you would probably be interested in first
## differences in the distribution of expected values. If you are
## interested in making a one-off prediction, then you are better off
## looking at the distribution of predicted values.
## As a study's sample size approaches infinity, the distribution of
## expected values collapses to a spike, due to decreased estimation
## uncertainty. But the variance of predicted values does not
## approach 0, since there is always fundamental uncertainty, which
## has nothing to with how much data you have. This can be demonstrated
## by 'lying' to the computer and creating a much larger dataset by
## randomly sampling from our current dataset with replacement. (Note:
## we don't actually have any more information statistically, but we are
## approximating an imaginary situation where we did).
## Below is a function that creates a fake data set; you can choose the
## size. It repeats the procedure to simulate predicted and expected values
## of tax share, for explanatory variable values that correspond to Arizona
## in 1970, with political competition at -.3.
fake <- function(size){
set.seed(02138)
fakedata <- data[sample(1:nrow(data), size, replace = TRUE),]
lmfake <- lm(share_taxes_inc ~ compnorm + as.factor(state) + as.factor(year),
data = fakedata)
fakesimbetas <- rmvnorm(1000, mean=lmfake$coefficients, sigma=vcov(lmfake))
fakepredCovs <- c( 1, -.3, 1, c(rep(0,times=length(4:68))), 1,
c(rep(0,times=length(70:100))) )
fakesim <- apply( fakesimbetas, 1, function(x) {
rnorm( 1, mean=fakepredCovs%*%x, sd=sd(residuals(lmfake)) )} )
fakeexpectedsim <- apply( fakesimbetas, 1, function(x) {
mean(rnorm( 1000, mean=fakepredCovs%*%x, sd=sd(residuals(lmfake))) )} )
plot(density(fakesim), main=as.character(size),
xlab="Tax Share",col="blue",ylim=c(0,7))
abline(v=mean(fakesim),col="red")
points(density(fakeexpectedsim), type='l',col="green")
abline(v=mean(fakeexpectedsim),col="yellow")
}
## Note that as this imaginary dataset gets bigger, the expected
## value distribution collapses but the predicted value distribution
## does not!
par(mfrow=c(2,2))
fake(1000)
fake(4000)
fake(16000)
fake(64000)
savePlot(filename ="Figs/ExpectedPredictedCollapse.pdf", type="pdf")
dev.off()
#### Contributors: Ana Catalano, Erru Yang, Yanfang Su
#### April 20, 2011
#### Example of Simulating First Differences
#### Single-Equation Models: pg. 24
############################################################################
# This code shows how to simulate first differences in two #
# different ways, manually and using Zelig. #
############################################################################
rm(list=ls(all=TRUE))
## First creating a dataset to work with:
female<-c(1, 0, 1, 0, 0, 0, 1, 1, 0, 0)
educ<-c(2, 1, 2, 2, 3, 3, 1, 2, 2, 1)
job<-c(1, 1, 1, 0, 0, 1, 1, 0, 1, 0)
data<-as.data.frame(cbind(female, educ, job))
X <- cbind(1, female, educ) #intercept and covariates
y<- job #dependent variable
## Running a probit model to predict job
library(Zelig)
out<-zelig(factor(job)~female + educ, model = "probit", data = data)
betas<-out$coef
varcov<-vcov(out)
## Simulating coefficients from a normal distribution
library(mvtnorm)
library(pscl)
set.seed(02138) #remember to set the seed to replicate later
simbetas <- rmvnorm(10000,betas, varcov)
head(simbetas)
## Setting the covariates to their means
meanbetas <- c(apply(X, 2, FUN=mean))
## Setting the variable of interest to the values you care about
## (for continuous or ordinal variables like education this may be a min and max)
means.female<-meanbetas
means.female[2]<-1 #[2] because female comes second in your X
means.male<-meanbetas
means.male[2]<-0
first.diffs<-pnorm(means.female%*%t(simbetas)) - pnorm(means.male%*%t(simbetas))
mean(first.diffs) #difference in expected values
quantile(first.diffs, .025); quantile(first.diffs, .975) #confidence interval
##Example of Zelig doing this for you
x.low <- setx(out, female = 0)
x.high <- setx(out, female = 1)
s.out <- sim(out, x = x.low, x1 = x.high)
summary(s.out) #and you should get the same thing as the longer method, above.
plot(s.out)
#### Contributors: Leslie Finger and Adela Soliz
#### April 20, 2011
#### Reparameterizing on an Unbounded Scale and First Differences
#### Single-Equation Models: Pg. 25
######################################################################################
# This code shows that by reparameterizing pi in the logistic model on an unbounded #
# scale, so that pi=1/(1+exp(-Xbeta)) we are able to maximize the parameters more #
# easily. To sum up the steps, we first subset the data, then we write and optimize #
# functions for the logistic model first on an unbounded scale and then on a bounded #
# scale. Finally, we simulate first differences for certain interesting populations.#
######################################################################################
rm(list=ls(all=TRUE))
library(MASS)
library(Zelig)
## To demonstrate the use of reparameterizing on an unbounded scale,
## let's look at how welfare determines whether kids go to school (scbase) using a logit model.
load("Gov2001CodeLibrary.RData")
dat <- kids
head(dat)
## Subsets the data so it only includes data from 1998.
## This data set also includes 1999 observations, so this deletes any kids in the data set for
## both years. This data looks at a conditional cash transfer intervention starting in 98; by
## looking at scbase, we are only looking at school in 97, prior to the conditional cash transfer
## intervention.
dat98 <- dat[dat$year==98,]
unique(dat98$year)
## Subsets the data to include just the variables we care about (the outcome, scbase, the explanatory variable,
## welfare, and the controls: child's gender, indigenous status, grade, the head of household's education, sex,
## age, the number of family members, and the distance from secondary school).
dat1 <- subset(dat98, select=c(scbase, welfare, gender, indig, hohedu, hohage,
hohsex, fam_n, dist_sec),
drop=TRUE)
head(dat1)
## This drops the na's from the dataset. Ordinarily we would want to impute this missing data,
## but for the purpose of this exercise, we took this shortcut.
dat2<-na.omit(dat1)
dim(dat2)
head(dat2)
## Functions and Optimization
## There are two ways to find the parameter values- using your own function or using Zelig.
## Makes a function of the log likelihood of the logistic model.
logit.ll<-function(par, y, X){
betas<-par
X<-as.matrix(X)
ll<-sum((-y*log(1+exp(-X %*% betas)))+((1-y)*log(1-1/(1+exp(-X %*% betas)))))
return(ll)
}
## Defines the variables in the function: y is the first column of the data matrix,
## Xs are all the columns but the first one, and X is a matrix of the data plus a column of 1s
## at the beginning for the intercept.
head(dat2)
Xs<-as.matrix(dat2[,-1])
y<-as.vector(dat2$scbase)
X <- cbind(1,Xs)
head(X)
## Optimizes the function to find the parameter values. Our Zelig results are slightly different
## from those produced by optim. We believe this is because our function is badly behaved.
par <-rep(0,9)
opt.log<-optim(par, y=y, X=X, fn=logit.ll, method="BFGS", control=list(fnscale=-1), hessian=TRUE)
## Or you could optimize using zelig.
results <- zelig(scbase ~ welfare + gender + indig + hohedu + hohage + hohsex +
fam_n + dist_sec, data=dat2, model = "logit")
results$coef
summary(results)
vcov(results)
## What if we do not put the model on an unbounded scale, but rather set pi=(-X %*% beta)?
## Then our function becomes:
logit.llb<-function(par, y, X){
betas<-par
X<-as.matrix(X)
llb<-sum(-y*(-X %*% betas)+(1-y)*(1-(-X %*% betas)))
return(llb)
}
## And we optimize using the L-BFGS-B method which allows us to set upper and lower bounds.
## This is necessary because the parameters in the bernoulli model has to be between 0 and 1.
opt.logb<-optim(par=rep(0,10), y=y, X=X, fn=logit.llb, method="L-BFGS-B", lower=rep(0,10),
upper=rep(1,10), control=list(fnscale=-1), hessian=TRUE)
## Putting pi on an bounded scale causes problems when we try to optimize.
## That's why we put it on an unbounded scale.
## Simulations and First Differences
## Let's simulate the difference between the probability of school attendance changing
## the gender variable for boys and girls (Male = 1/Fem = 0) and setting all other variables
## at their medians.
set.seed(012345) #Set the seed so you can reproduce the results.
draws <- mvrnorm(10000, mu=results$coef, Sigma=vcov(results))
#Draw from a distribution defined by your optimization results.
head(draws)
nrow(draws)
meds <- apply(dat1[,-1], MARGIN=2, FUN=median)
#Sets all variables except the outcome to their median values.
meds
medians <- c(1, meds)
# Adds a 1 at the beginning of the vector of median values to account for the intercept.
## Makes a vector of all variables at their median values, except for gender which is set to 0
## for a girl and 1 for a boy.
boys <- medians
girls <- medians
girls["gender"] <- 0
girls
## Calculates a simulated value of pi for all variables set to their medians except gender.
pi.boys <- 1/(1+ exp(-boys %*% t(draws)))
pi.girls <- 1/(1+ exp(-girls %*% t(draws)))
firstdif <- pi.boys- pi.girls
mean(firstdif)
## Boys have a higher probability of attending school. This may be due to childbirth etc.
quantile(firstdif, .025); quantile(firstdif,.975)
## Finds quantiles by gender, to gauge certainty.
quantile(pi.boys, .025); quantile(pi.boys,.975)
quantile(pi.girls, .025); quantile(pi.girls,.975)
## We also calculated the expected value of pi for the whole model.
## This is not as useful as the first differences, but it gives us some idea of the probability of school
## enrollment for the average person in the dataset.
pi.model<-1/(1+exp(-medians%*%t(draws)))
mean(pi.model)
quantile(pi.model, .025); quantile(pi.model, .975)
## In sum, reparameterizing pi to an unbounded scale facilitates the optimization
## and allows us to simulate quantities of interest.
#### Contributors: Chilton, Crouch, Lavie
#### April 20, 2011
#### Graph on Confidence Intervals
#### Single-Equation Models: Pg. 28
######################################################################################
# The goal: create a graph that looks similar to the one on p. 28 to the #
# single equation slides with different data #
# #
# the data is taken from #
# "Ethnicity, Insurgency and Civil War," by James Fearon and David Laitin. #
# The article uses a number of variables to predict the onset of civil conflict. #
# #
######################################################################################
rm(list=ls(all=TRUE))
library(Zelig)
library (MASS)
## load the data from the relevant library
## removing unnecessary variables and cleaning
## the data
load("Gov2001CodeLibrary.RData")
vars.m1 <- c("onset","warl","gdpenl","lpopl1","lmtnest", "ncontig", "Oil",
"nwstate","instab","polity2l","ethfrac","relfrac")
fearonclean<-drop(fearon[vars.m1])
head(fearonclean)
fearonclean<-na.omit(fearonclean)
fearonclean["onset"]<-replace(fearonclean["onset"],
fearonclean["onset"]==4,1)
## running the basic logit regression
output<- zelig(onset ~ warl + gdpenl + lpopl1 + lmtnest + ncontig +
Oil + nwstate + instab + polity2l+ ethfrac + relfrac,
data = fearonclean, model = "logit")
## simulating the betas
coef<-output$coefficients
vcov<-summary(output)$cov.unscaled
set.seed(02108)
betas <- mvrnorm(n = 1000, mu = coef, Sigma = vcov)
## betas, then, is a simulated matrix with possible values of the coefficients
## now turning on to create simulated data, in this case the median data
fearonclean1<-as.matrix(fearonclean)
data<-c(rep(0,ncol(fearonclean1)))
names(data)<-c(colnames(fearonclean1))
for (i in 1:length(data)){
data[i]<-median(fearonclean1[,i])
}
data[1]<-1
names(data)[1]<-"Intercept"
## first differences- a country with 1 on nwstate (new states) parameter
datanwst<-data
datanwst["nwstate"]<-1
gdpseq <- seq(.25,7,.25)
## creating a matrix of non-new state countries where
## the difference is the changing GDP
datamat <- matrix(rep(data), nrow = length(gdpseq),
ncol = length(data), byrow = TRUE)
datamat[,3] <- gdpseq
## and a parallel matrix for nwstate=1 countries
datamatnwst <- matrix(rep(datanwst), nrow = length(gdpseq),
ncol = length(datanwst), byrow = TRUE)
datamatnwst[,3] <- gdpseq
colnames(datamat)<-c(names(data))
colnames(datamatnwst)<-c(names(datanwst))
## simulating the expected values
holder<-holdernwst<-matrix(NA, nrow=nrow(datamatnwst), ncol=3, byrow=TRUE)
temp<-c(rep(0,nrow(betas)))
for (i in 1:nrow(datamat)){
temp <-1/(1+exp(-datamat[i,]%*%t(betas)))
holder[i,1] <-mean(temp)
## the confidence intervals are the 2.5% "tails"
holder[i,2]<-sort(temp)[25]
holder[i,3]<-sort(temp)[975]
## same for the new states
temp <-1/(1+exp(-datamatnwst[i,]%*%t(betas)))
holdernwst[i,1] <-mean(temp)
holdernwst[i,2]<-sort(temp)[25]
holdernwst[i,3]<-sort(temp)[975]
}
pdf("Figs/ConfidenceIntervals.pdf")
plot(gdpseq, holder[,1], type = "l", ylim = c(0,.25), lwd = 2,
col = "darkblue", xlab = "GDP per capita (thousands of dollars)",
ylab = "Prob of Civil War Onset", cex = .1)
##creating the vertical confidence intervals
for (i in 1:length(gdpseq)){
segments(gdpseq[i], holder[i,2],gdpseq[i], holder[i,3],col = "darkblue",
lwd=1.2)
}
##now same for nwstates
lines(gdpseq, holdernwst[,1], col = "darkorange3", lwd = 2)
for (i in 1:length(gdpseq)){
segments(gdpseq[i], holdernwst[i,2],gdpseq[i], holdernwst[i,3],col =
"darkorange", lwd=1.2)
}
legend("topleft", legend = c("Regular Countries","New States"),
lwd = c(2,2), col = c("darkblue","darkorange3"),
cex = .6)
dev.off()
#### Contributors: Nils Hagerdal and Dan Masterson
#### April 20, 2011
#### Ternary Plots in R
#### Single-Equation Models: Pg. 36
######################################################################
# The idea is to create plots that show the distribution #
# of some variable for a range of subjects, where the #
# variable is split between 3 possible values that all #
# sum up to a constant: a + b + c = K #
# #
# Eg, UK vote share: Tories + Labour + Liberals = 100% #
# for each constituency. #
# #
# Here, we use the distribution of employment in European #
# countries: employment shares in the primary, secondary #
# and tertiary sectors for each country sum to 100%. #
# We use the package "ade4" and the "euro123" data set, #
# with code borrowed from: #
# http://addictedtor.free.fr/graphiques/RGraphGallery.php?graph=34 #
######################################################################
rm(list=ls(all=TRUE))
library(ade4)
data(euro123)
summary(euro123)
head(euro123)
colnames(euro123$in78) <- c("Primary", "Secondary", "Tertiary")
## triangle.plot() is the main command
pdf("Figs/TernaryPlot1.pdf")
triangle.plot(euro123$in78, clab = 0, cpoi = 0.9, addmean = TRUE, show = FALSE,
sub = "Employment by sector, EU 1978", csub = 1, possub = "topleft")
dev.off()
pdf("Figs/TernaryPlot2.pdf")
triangle.plot(euro123$in78, label = row.names(euro123$in78), clab = 0.5,
show = FALSE,
sub = "Employment by sector, EU 1978", csub = 1, possub = "topleft")
dev.off()
#### Contributors: Michael Hankinson & Jackelyn Hwang
#### April 20, 2011
#### fit and simulate quantities of interest from a grouped uncorrelated binary variables model
#### Single-Equation Models: Pg. 47
####################################################################################################
# In plain English: The Grouped Uncorrelated Binary Variable (GUBV) model is similar to a #
# binary logit, but we are looking at i.i.d. (uncorrelated) groups (grouped) of Bernoulli #
# trials (binary variables). In other words, we may only observe a total count of the binary #
# outcomes, not the outcome of each individual event. #
# #
# For instance, Gary uses the example of the number of times you voted out of the last 5 #
# elections. However, instead of observing each individual turnout, we receive the total number #
# of turnouts (coded as "1") as a single value. Were to you to vote in 4 elections, our outcome #
# would be "4". Yet, we have no idea specifically in which of the past five elections you voted, #
# but simply that you voted in four of them. #
# #
# How to Simulate Quantities of Interest - (raw instructions as drawn from G. King slides). #
# 1) Run optim and get beta-hat and the variance matrix #
# 2) Draw many values of beta-tilde from the multivariate normal with mean vector beta-tilde and #
# the variance matrix that come from optim. #
# 3) Set X to your choice of Values, X-sub"c" #
# 4) Calculate simulations of the probability that any of the component binary vairables is a one. #
# 5) If pi is of interest, summarize with mean, SD, CI's or histogram as needed. #
# 6) If simulations of y are needed, go one more step and draw y-tilde from Binomial(y|pi). #
####################################################################################################
rm(list=ls(all=TRUE))
## First, let's build some data to play with!
## Generate a true dataset of n=500 observations with 6 variables: Y, X1, X2, X3, X4, X5.
## The Xs will be drawn from a multivariate normal with means 0, variances 1, and the
## following correlation matrix
## Correlation matrix (1 -.12 -.1 .5 .1, -.12 1 .1 -.6 .1, -.1 .1 1 -.5 .1, .5 -.6 -.5 1 .1,
## .1 .1 .1 .1 1).
library(MASS)
set.seed(62788)
sigma <- matrix(c(1,-.12,-.1,.5,.1,-.12,1,.1,-.6,.1,-.1,.1,1,-.5,.1,.5,-.6,-.5,1,.1,.1,.1,.1,.1,1),5,5)
truedata <- mvrnorm(n=500,rep(0,5),sigma)
colnames(truedata) <- c("X1","X2","X3","X4","X5")
head(truedata)
## Now that we have our Xs, let's draw our Ys. Because we are looking for integers, we'll pull
## 5 draws from a binary distribution. These 5 draws will be summed into a single variable, our
## Y, representing the number of elections in which the individual voted.
Ys<-rbinom(500,5,.5)
Ys
## Great, now we cbind our Ys with our Xs.
GUBVdata1<-cbind(Ys,truedata)
head(GUBVdata1)
## Oh, and don't forget the intercept.
intercept<-matrix(c(rep(1,500)))
GUBVdata<-cbind(intercept,GUBVdata1)
head(GUBVdata)
## Fantastic!
## Let's try to understand Gary's instructions.
## 1) Run (optim) and get beta-hat and the variance matrix
## First, we need the function. Below is the GUBV function as taken from Gary's slides.
## lnL(Beta|y) = sum(N-y)*log(1+exp(X %*% beta)-y*log(1+exp(-X %*% beta)))
## Because we are looking at 5 elections, our N will be 5, the maximum number of events in
## which one can participate.
GUBV <- function(beta,y,X){
xbeta<- X%*% beta
out<- sum((5-y)*(log(1+exp(xbeta))-y*log(1+exp(-xbeta))))
return(out)
}
## Before running optim, we need to define our data components.
X<-as.matrix(GUBVdata1[,-1])
y<-as.vector(GUBVdata1[,1])
## And, now we'll run optim.
results<-optim(par=rep(0.1,5), y=y, X=X, fn=GUBV, method="BFGS", control=list(fnscale=-1), hessian=TRUE)
## 2) Draw many values of beta-tilde from the multivariate normal with mean vector beta-tilde and
## the variance matrix that come from optim.
## To get the mean vector beta-tilde and the variance matrix, we need to pull apart the optim results.
## Varaince Matrix
hessian <- as.matrix(results$hessian)
varcov <- solve(-hessian)
## Mean Vector
coef<-as.vector(unlist(results$par))
## Now, we'll draw our "many values of beta-tilde" from the multivariate normal. How about 500?
draws<-mvrnorm(n=500, mu=coef, varcov)
## 3) Set X to your choice of Values, X-sub"c"
## Let's create an individual based on the mean of each covariate. It may not occur in the real world,
## but it should fit within our data.
MeanGUBV<-matrix(c(mean(GUBVdata1[,2]), mean(GUBVdata1[,3]), mean(GUBVdata1[,4]),
mean(GUBVdata1[,5]), mean(GUBVdata1[,6])), nrow=1, ncol=5)
## 4) Calculate simulations of the probability that any of the component binary vairables is a one.
## Each of the 5 elections can register a 0 or 1, depending on whether our created individual decided to vote.
## We can apply our drawn values of beta-tilde to our created individual to calculate their probability of
## participating in any individual election.
pi.one<- 1/(1+exp(-MeanGUBV %*% t(draws)))
## Now, we have 500 simulated probabilities of whether our created individual will turn out to vote.
## 5) If pi is of interest, summarize with mean, SD, CI's or histogram as needed.
## I love visuals. Let's run some visual diagnostics.
## Mean
meanpi<-mean(pi.one)
## For ease, we'll make this a pretty vector.
tpi.one<-t(pi.one)
pi.one.vec<-as.vector(pi.one)
## Standard Deviation - always important
standev<-sd(as.vector(pi.one))
## Confidence Interval, with n=500
error<-qnorm(0.975)*standev/sqrt(500)
left<-meanpi-error
right<-meanpi+error
left
right
## Histogram, with Confidence Intervals
pdf(file="Figs/GroupedUnc.pdf")
hist(tpi.one, col="lavender", main="Histogram of Probabilities of One, With Confidence Interval", xlab="Probability of One")
abline(v=left, col="red")
abline(v=right, col="red")
dev.off()
## Wow, looks like a tight CI. Alright!
## 6) If simulations of y are needed, go one more step and draw y-tilde from Binomial(y|pi).
## Let's take 1000 draws of y, based on our new probability of each binary component.
## This will help simulate the number of elections in which our created individual will participate.
simy<-rbinom(1000, 5, meanpi)
median(simy)
## Three elections. 3 out of 5. Better than the average American, right?
## Congratulations! You've mastered Grouped Uncorrelated Binary Variable Modeling.
## What a relief! Now, back to Facebook.
#### Contributors: Frey and Walker
#### April 20, 2011
#### Variance in Poisson Models
#### Single-Equation Models: Pg. 55
#####################################################################################################
# We examine the variance assumptions of the Poisson count model. #
#####################################################################################################
rm(list=ls(all=TRUE))
load("Gov2001CodeLibrary.RData")
## Overdispersion
## Loading the Data
data<- Overdispersed
names(data)
## Altering "Treat"
data$treatf<-factor(data$treat)
## Generating a Poisson
PoissonOD<-glm(count ~ treatf + prior + age, family=poisson, data=data)
summary(PoissonOD)
## Calculating the Residuals of the Poisson Model
resid<-residuals.glm(PoissonOD, type="deviance")
fitted<-PoissonOD$fitted.values
## Calculating SSR: sum residuals^2/(#observations-#coefficients)
sum(resid^2)/(59-4)
## Comparing to Chi-Squared with df=#observations-#coefficients
1-pchisq(sum(resid^2),55)
## Plotting the Residuals
plot(fitted, resid, main="Residuals for Overdispersed Poisson Model",
xlab="Fitted Values", ylab="Deviance Residuals", ylim=c(-4,4))
## Correct Variance
## Generating Data
var1<-factor(c('A','A','A','A','A','A','A','A','A','A','B','B','B','B','B','B','B','B','B','B'))
var2<-factor(c('Y','N','Y','Y','Y','Y','N','Y','N','Y','N','Y','N','Y','Y','Y','Y','N','N','N'))
var3<-c(10,5,9,4,6,14,2,3,8,10,11,15,4,5,7,9,12,6,5,8)
var4<-c(12,15,13,5,8,17,10,6,22,14,20,18,11,11,12,13,28,10,14,10)
data<-data.frame(Treat, var1, var2, var3)
## Generating Poisson
PoissonCorrect<-glm(var3 ~ var1 + var2, family=poisson, data=data)
summary(PoissonCorrect)
## Residuals and Fitted Values
residC<-residuals.glm(PoissonCorrect, type="deviance")
fittedC<-PoissonCorrect$fitted.values
## Calculating SSR: sum sq resid / #observations - #coefficients
sum(residC^2)/(20-3)
## Chi-Squared
1-pchisq(sum(residC^2), 17)
## Plotting
plot(fittedC, residC, main="Residuals for Overdispersed Poisson Model",
xlab="Fitted Values", ylab="Deviance Residuals")
## NOTE: If you offset the data, the dispersion appears to be even smaller
## (the variance appears more correct)
## To see this, run the following:
PoissonOffset<-glm(var3 ~ var1 + var2, family=poisson, data=data, offset=log(var4))
residOffset<-residuals.glm(PoissonOffset, type="deviance")
fittedOffset<-PoissonOffset$fitted.values
1-pchisq(sum(residOffset^2), 17)
plot(fittedOffset, residOffset,
main="Residuals for Overdispersed Poisson Model",
xlab="Fitted Values", ylab="Deviance Residuals")
## Underdispersed Poission
probs<-prop.table(dbinom(0:10, 10, .5) +
ifelse(0:10==5,1,0))
dataUD<-sample(0:10, 1000, pr=probs, repl=TRUE)
obs<-c(1:1000)
data.UD<-data.frame(obs=obs, dataUD=dataUD)
## Generating a Poisson
PoissonUD<-glm(obs ~ dataUD, family=poisson, data=data.UD)
summary(PoissonUD)
## Calculating the Residuals of the Poisson Model
residUD<-residuals.glm(PoissonUD, type="deviance")
fittedUD<-PoissonUD$fitted.values
## Calculating SSR: sum residuals^2/(#observations-#coefficients)
sum(residUD^2)/(998)
## Comparing to Chi-Squared with df=#observations-#coefficients
1-pchisq(sum(residUD^2),998)
## Plotting the Residuals
plot(fittedUD, residUD, main="Residuals for Overdispersed Poisson Model",
xlab="Fitted Values", ylab="Deviance Residuals", ylim=c(-4,4))
#### Contributors: James Conran, Jeremy Ferwerda, Yue Hou
#### April 20, 2011
#### Logit vs. Probit
#### Single-Equation Models: No pg. #
#####################################################################################################
# Show that the logit and probit do not offer substantively different quantities of interest #
#####################################################################################################
rm(list=ls(all=TRUE))
## We'll start by visualizing the probit and logit functions
## First, we will plot the probability densities using R's canned functions rnorm and rlogis
## Number of iterations
n <- 10^6
## Set up the plot space
par(mfrow=c(1,2))
## Graph densities
plot(density(rnorm(n)),main="Density",xlab="x") # probit
lines(density(rlogis(n)),col="blue") # logit
legend("topleft",legend=c("Probit","Logit"),bty="n",cex=.7,lty=1,col=c("black","blue"))
## input values
test <- seq(-5,5,by=.01)
## Second, let's plot the inverse link functions
link.logit <- 1/(1+exp(-test))
link.probit <- pnorm(test)
## Graph inverse link functions
plot(test,link.probit,type="l",xlim=c(-5,5),main="Inverse Link Function",xlab="XB",ylab="pi") #probit
lines(test,link.logit,col="blue",type="l",xlim=c(-5,5)) #logit
legend("topleft",legend=c("Probit","Logit"),bty="n",cex=.7,lty=1,col=c("black","blue"))
savePlot(filename = "Figs/LogitProbit1.pdf", type="pdf")
## It should be clear that logit and probit are very similar, except at the tails of the distribution.
## As we will see in the next section, the difference in the tails does not result in substantially
## different estimates
## Concrete Example ##
## Use a default dataset that has a dichotomous variable
data(mtcars)
attach(mtcars)
## Let's test the probability that a car will have an automatic transmission (am),
## based upon its mpg rating and the number of cylinders.
## Create a matrix for the results and covariates
y <- as.matrix(am) # results - automatic transmission
X <- cbind(1, mpg, cyl) # covariates
## Define the log-likelihood functions. Although this can be done quickly using Zelig,
## these functions are included in order to show the log-likelihood of the inverse link functions.
ll.probit <- function(beta, y=y, X=X){
xbeta = X %*% beta
phi <- pnorm(xbeta, log = TRUE)
phi2 <- pnorm(xbeta, log = TRUE, lower.tail = FALSE)
result <- sum(y*phi + (1-y)*phi2)
return(result)
}
ll.logit <- function(beta, y=y, X=X){
xbeta = X %*% beta
result <- -sum(log(1+exp((1-(2*y))*xbeta)))
return(result)
}
## Use optim to find the results, inputting a vector of 0s as starting values
result_probit <- optim(par = c(rep(0,ncol(X))), fn = ll.probit, y=y, X=X,
control = list(fnscale = -1),method = "BFGS", hessian = TRUE)
result_logit <- optim(par = c(rep(0,ncol(X))), fn = ll.logit, y=y, X=X,
control = list(fnscale = -1),method = "BFGS", hessian = TRUE)
## Although the coefficients are different, the interpretation is not
result_probit$par
result_logit$par
## Calculate the probability of having an automatic transmission for each row in X
probit_o <- pnorm(X %*% result_probit$par)
logit_o <- 1/(1+exp(-X %*% result_logit$par))
## Table of predictions - both methods produce similar estimates
table <- cbind(probit_o,logit_o,am)
colnames(table) <- c("Probit Pr", "Logit Pr", "Automatic")
table
## Plot the resulting predictions
par(mfrow=c(1,1))
plot(density(probit_o),main="Logit vs Probit",xlab="",ylab="")
lines(density(logit_o),col="blue")
legend("topleft",legend=c("Probit","Logit"),bty="n",cex=.7,lty=1,col=c("black","blue"))
savePlot(filename = "Figs/LogitProbit2.pdf", type="pdf")
## As should be clear, the logit and probit result in very similar predictions
## Note that for a quick check, you can also follow Amemiya's (1981) and King's (1989) advice
## and multiply probit ceofficients by 1.6 in order to provide a rough comparison with the logit scale.
result_probit$par * 1.6
result_logit$par
#### Contributors: Sorapop Kiatpongsan and Slawa Rokicki
#### April 20, 2011
#### Exponential Models
#### Single-Equation Models: No pg. #
######################################################################################################
# We show how to estimate an exponential model in R and produce Hazard ratios and other quantities #
# of interest (using Zelig) #
# This code is to show how to fit the data into an exponential model in R and produce Hazard ratios #
# and other quantities of interest. #
# Data is from the course BIO 223 Applied Survival Analysis at Harvard School of Public Health. #
# This is the data of patients with bladder tumor with 8 covariates. #
# Group 1 = Control, 2 = Treatment #
# futime = follow up time #
# number = number of tumor #
# size = size of largest tumor #
# t1 = time at the first recurrence #
# t2 = time at the second recurrence #
# t3 = time at the third recurrence #
# t4 = time at the forth recurrence #
# Note that NA means that the data is censored. #
# Using data "bladder tumor" #
# Attached as txt file "bladder.tumor.noheader.txt" #
######################################################################################################
rm(list=ls(all=TRUE))
## Read Data
load("Gov2001CodeLibrary.RData")
## View the data
bladder
## Look at the distributions of baseline covariates
table(bladder$group)
table(bladder$number)
table(bladder$size)
## Now, using Zelig to estimate an exponential model
library(Zelig)
## Set random seed number
set.seed(02138)
## Use the exponential duration regression model if you have a dependent variable representing
## a duration (time until an event). The model assumes a constant hazard rate for all events.
## The dependent variable may be censored (for observations have not yet been completed when
## data were collected). Ref. Zelig manual
## Creating censoring indicator c = o if t1 is censored otherwise c = 1
## "time" is time to first recurrence or end of follow-up;
## "c" = 1 if subject experienced a recurrence (not censored);
c <- as.numeric(is.na(bladder$t1)==F)
time <- pmin(bladder$futime, bladder$t1, na.rm=T)
bladder <- cbind(bladder, c, time)
head(bladder)
## first observation has 0 follow-up time, not used in any analysis,
## drop first as it gives errors
bladder <- bladder[-1,]
## Now using Zelig to estimate and compare first recurrence time between treatment and control groups
z.out <- zelig(Surv(time, c) ~ group, model = "exp", data = bladder)
summary(z.out)
## Defining covariates for control and treated groups
x.control <- setx(z.out, group = 1)
x.treated <- setx(z.out, group = 2)
## To find a hazard ratio for an exponential model
## Note that the expected value = 1/hazard
## Then, to find hazard ratio is to find the ratio of the 1/expected value
s.out.control <- sim(z.out, x = x.control)
summary(s.out.control)
## E(Y/X) = 26.73509
## Hazard = 1/26.73509 = 0.03740403
s.out.treated <- sim(z.out, x = x.treated)
summary(s.out.treated)
## E(Y/X) = 45.64097
## Hazard = 1/45.64097 = 0.02191014
## Then, the hazard ratios of hazard of the control to the hazard of the
## treated group = 0.03740403/0.02191014 = 1.707156
## Finding other quantity of interest: The difference between the time to first recurrence
## between the control and treated groups
## Simulation to find expected values and first differences
s.out <- sim(z.out, x = x.control, x1 = x.treated)
summary(s.out)
plot(s.out)
## if you wanted to pull out the individual expected values:
expected.treated<-mean(s.out.treated$qi$ev)
expected.treated
expected.control<-mean(s.out.control$qi$ev)
expected.control
hazard.treated<-1/expected.treated
hazard.treated
hazard.control<-1/expected.control
hazard.control
Hazard.Ratio<-hazard.control/hazard.treated
Hazard.Ratio
#### Contributors: Maxwell Palmer & Andrew Hall
#### April 20, 2011
#### Weibull Models
#### Single-Equation Models: No pg. #
###############################################################################################
# Show how to estimate a weibull model in R and produce Hazard ratios and other quantities of #
# interest (using Zelig) #
# Note that the code and example here comes straight from #
# http://cran.r-project.org/web/packages/Zelig/vignettes/weibull.pdf #
# I merely add expository comments. #
###############################################################################################
rm(list=ls(all=TRUE))
library(Zelig)
## Use sample dataset from King, Alt, Burns, and Laver (1990)
data(coalition)
## Data description from original paper:
## "This data set contains survival data on government coalitions in parliamentary democracies
## (Belgium, Canada, Denmark, Finland, France, Iceland, Ireland, Israel, Italy, Netherlands, Norway,
## Portugal, Spain, Sweden, and the United Kingdom) for the period 1945-1987. For parsimony, country
## indicator variables are omitted in the sample data."
## We want to estimate a model in which survival times of coalitions follow a Weibull distribution.
## Our dependent variable is the observed survival time of a coalition.
## However, this is censored data because there is a known maximum duration
## governed by the country's election laws. These laws create what the authors call a
## "constitutional interelection period (CIEP)" which induces right-censoring on
## the duration observations.
## We can see how many right-censored observations we have:
table(coalition$ciep12)
## When ciep12 is 1, the coalition survived to the twelve months before the CIEP
## and therefore the duration value is censored. The true duration could be
## larger than the observed.
## Therefore we need to modify our dependent variable using Surv()
## The call to Surv() creates the "survival object", which takes into account the
## censored nature of the data.
## You pass in your time variable (duration here) and your event variable (ciep12 here)
## and you get back a Surv object you can use for survival models.
## Then you run zelig with the Surv object as the dependent variable.
## Given the Weibull model, zelig will find MLE estimates for your explanatory variables
## fract and numst2.
## fract reflects the number/size of parties in the parliament, with higher values meaning
## more fractionalization. numst2 is a dummy, 1 if majority government, 0 if minority government.
z.out <- zelig(Surv(duration, ciep12) ~ fract + numst2, model="weibull", data=coalition)
## Now we have our estimated model, so we can make simulations from it.
## For example, here we simulate to find the effect of being a majority government
## vs. being a minority
## government on the survival of the coalition.
x.low <- setx(z.out, numst2=0)
x.high <- setx(z.out, numst2=1)
s.out <- sim(z.out, x=x.low, x1=x.high)
summary(s.out)
plot(s.out)
#### Contributors: Colin Sullivan & Todd Kawakita
#### April 20, 2011
#### Cox Proportional Hazards Models
#### Single-Equation Models: No pg. #
###############################################################################################
# This brief guide will demonstrate Cox proportional hazards model in Zelig. We will use #
# data from Zelig and some fancy footwork to estimate a hazards model and produce quantities #
# of interest. This draws heavily on examples in the Zelig manual (written by Patrick Lam #
###############################################################################################
rm(list=ls(all=TRUE))
## Load the Zelig Package
library(Zelig)
## Load new data -
## We will be looking at a dataset in Zelig called "coalition2" - Coalition Dissolution in
## Parliamentary Democracies, Modified Version
data(coalition2)
## Let's have a look at the data:
head(coalition2)
summary(coalition2)
## Variables:
## duration - The length of duration in months of the cabinet coalition
## ciep12 - Binary variabel - did the coalition conclude its constitutional interelection
## period, or maximum length of duration?
## invest - Binary - Is there a legal requirement for legislative investiture?
## fract - An index of the number and size of parties in parliament
## polar - Measures polarization - support for extremist parties
## numst2 - Binary - Is this a majority government?
## crisis - The length of crisis preceding government formation (in days)
## country - No, this is not a trick. Name of country
## So, how long do coalitions last?
plot(density(coalition2$duration, bw = 1.5), xlim=c(0, 70), main = "Duration Density",
xlab="Cabinet Duration in Months")
abline(v=mean(coalition2$duration))
## Estimating the model
## Let's start by looking at just one covariate: invest
## The syntax: z.out <- zelig(Surv(Y, C) ~ X1 + X2, model = "coxph", data = mydata)
## where Y is the survival measure (duration) and C is the censoring mechanism (ciep12),
## and X1 and X2 are your covariates
ex1 <- zelig(Surv(duration, ciep12) ~ invest, model = "coxph", data = coalition2)
summary(ex1)
## Note that if your censorship mechanism is missing, Zelig will assume all observations are censored!
ex1.b <- zelig(Surv(duration) ~ invest, model = "coxph", data = coalition2)
summary(ex1.b)
## Running the model with several covariates
ex2 <- zelig(Surv(duration, ciep12) ~ invest + fract + polar + numst2 + crisis,
model = "coxph", data = coalition2)
summary(ex2)
## Generating quantities of interest
## Imagine you were interested in the impact of moving from the first to the third quartile
## of fractionalization. First, we find the quartiles. Next, we assign those values to
## the two sets of covariates, and hold all other covariates at their medians.
summary(coalition2$fract)
x.lowfrac <- setx(ex2, fract = 677)
x.highfrac <- setx(ex2, fract = 788)
## Next, we SIMULATE!!!
diff.frac <- sim(ex2, x = x.lowfrac, x1 = x.highfrac)
plot(diff.frac)
## VARIATIONS ON A THEME
## Stratified Cox Proportional Hazards Model
## What's the difference? Baseline hazards vary, but coefficents are the same, across strata.
ex3.strat <- zelig(Surv(duration, ciep12) ~ invest + fract + polar + numst2 +
strata(crisis), model = "coxph", data = coalition2)
x.lowstrat <- setx(ex3.strat, numst2 = 0, strata = "crisis=1")
x.highstrat <- setx(ex3.strat, numst2 = 1, strata = "crisis=1")
diff.strat <- sim(ex3.strat, x = x.lowstrat, x1 = x.highstrat)
summary(diff.strat)
## Cox Proportional Hazards Model with Time-Varying Covariates
## What's the difference? Let's assume that right now, you are not wrestling a bear and
## you have some predicted survival duration. If you someday find yourself wrestling a bear,
## your predicted survival time might decrease... slightly. Zelig allows your model to
## include variables whose value changes for a given observation over time.
## Your data should be long - that is, you should have multiple observations for
## each case with time-varying variables. If you are using quarterly unemployment
## to predict how long your goldfish survive, you will have one "q_unempl" variable and
## one observation for every quarter that your golfish is still alive.
## The intervals are not required to be regular (quarterly) - they can be any length, and
## the start and stop times of each period must be defined for each observation.
## The function format is as follows:
## z.out <- zelig(Surv(start, stop, censoring) ~ X1 + X2 + ..., model="coxph", data=data)
#### Contributors: Jeff Javed and Noam Gidron
#### April 20, 2011
#### Zero-Inflated Negative Binomial Regression
#### Single-Equation Models: No pg. #
#################################################################################################
# We wish to show how to construct a zero-inflated negative binomial model from scratch. First, #
# this code constructs a function for the log-likelihood of the zero-inflated negative binomial #
# model. Next, it estimates the log-likelihood using a data set on fishing (Available at: #
# http://www.ats.ucla.edu/stat/R/dae/fish.csv). Finally, it runs the same data through a canned #
# zero-inflated negative binomial function from the pscl library to check for accuracy. #
# #
# Explanation of the Zero-Inflated Negative Binomial #
# The zero-inflated negative binomial model is used for event counts where we are worried that #
# we have overdispersion due to a large amount of structural zeroes in our dependent variable. #
# Thus, if y=0, then there is a possibility pi that we will get a structural 0 and a possibility#
# 1 - pi that the zero was drawn from a negative binomial distribution. Thus, the probability #
# distribution for P(Y=y|lambda, theta, pi) is: #
# (pi + (1-pi)*negbinomial(0|lambda,theta))^d * (1-pi)*negbinomial(y|lambda, theta)^(1-d) #
# where d = 1 when y = 0, and d = 0 when y > 0. #
# #
# NB: This implementation may be a bit unstable at the gamma(theta) stage. A production-quality#
# implementation would use the lgamma() function (as done in the pscl package). Here we use a #
# simpler coding for pedagogical purposes. #
# #
#################################################################################################
rm(list=ls(all=TRUE))
## Start a function that takes in parameters, a matrix of explanatory variables (X),
## a matrix for the zero-inflated variables (Z), and the dependent variable (y).
ll.zibinom <- function(par, X, Z, y){
#Set beta, gamma, and theta. Beta will contain as many parameters as are in the X matrix,
#while gamma will contain as many parameters as are in the Z matrix. Theta is a single parameter.
beta <- par[1:ncol(X)]
theta <- par[(ncol(X) + 1)]
gamma <- par[(ncol(X) + 2):length(par)]
#Set lambda.
lambda <- exp(X%*%beta)
#Set pi.
pie <- 1/(1+exp(-Z%*%gamma))
#Create the probability density function for a negative binomial if y > 0.
negbinom <- (gamma(theta+y)/((factorial(y))*gamma(theta)))*((lambda^(y))*(theta^(theta)))/
((lambda + theta)^(theta+y))
#Create the probability density function for a negative binomial if y = 0.
negbinom.zero <- (theta^(theta))/((lambda + theta)^(theta))
#Set d so that d = 1 when y = 0, and d = 0 when y > 0.
#This will differentiate between instances where y = 0 and y > 0.
d <- ifelse(y==0, 1, 0)
#Write out the log-likelihood for the zero-inflated negative binomial function.
sum((d)*log(pie + (1-pie)*(negbinom.zero)) + (1-d)*log((1-pie)*negbinom))
}
## Estimating the Model
## We use data about the probability that a group of people in a park caught a fish.
## The explanatory variables are the number of children in the group, the number of people in the group,
## and whether or not the group brought a camper to a park. The dependent variable is the number of
## fish caught. The data and example are from UCLA Academic Technology Services.
## Available at: http://www.ats.ucla.edu/stat/stata/dae/zinb.htm.
## Load and view data.
load("Gov2001CodeLibrary.RData")
data <- fish
head(data)
## The fish count is our dependent variable y.
y <- data$count
## The child and camper variables are the independent variables (X). We add a column of 1s
## so that we can calculate an intercept.
X <- as.matrix(cbind(1, data$child, data$camper))
## The zero-inflated variable is number of people in a group. Again, we add a column of 1s
## so that we can calculate an intercept.
Z <- as.matrix(cbind(1, data$persons))
## Optimize the function. Number of parameters will equal the number of columns in X and Z plus 1.
out <- optim(par=rep(0.01, ncol(X) + ncol(Z) + 1), ll.zibinom, Z=Z, X=X,y=y, method="BFGS",
control=list(fnscale=-1), hessian=TRUE)
## Extract parameters.
out$par
## Extract log-likelihood.
out$value
## Compare to the Canned Function
## Load the pscl library.
library(pscl)
## Run the zero-inflated binomial model. Place the Z matrix variables after the |.
zinb <- zeroinfl(count ~ child + camper | persons, dist = "negbin", EM=TRUE, data=data)
## Check and compare results. Note that zeroinfl provides the log(theta), while our function
## provides an unlogged theta.
summary(zinb)
out$par
out$value