#### "Single Equation Models" #### Harvard Gov2001 Code Library #### Instructor: Gary King #### Maintainers: Margaret Roberts and Brandon Stewart #### May 1, 2011 #### Version 0.5 #### Contact: bstewart@fas.harvard.edu # Summary: This file covers single equation models. # # Table of Contents: # Show how the logit graph varies with xbeta # Generate data for probit via a latent variable # Why we simulate from the multivariate normal # Simulation of Predicted Values and Expected Values # Example of Simulating First Differences # Reparameterizing on an Unbounded Scale and First Differences # Graph on Confidence Intervals # Ternary Plots in R # Grouped uncorrelated binary variables model # Variance in Poisson Models # Logit vs. Probit # Exponential Models # Weibull Models # Cox Proportional Hazards Models # Zero-Inflated Negative Binomial Regression #### Contributors: Ariel White, Ben Schneer, Michael Gill #### April 20, 2011 #### Show how the logit graph varies with xbeta #### Single-Equation Models: pg. 2 ########################################################################################## # Attribution: Thanks to Andrew Gelman: # # http://www.stat.columbia.edu/~gelman/bugs.course/logit/logit.R # # # # In this exercise, we'll look at how the graph of the logit curve varies with xbeta. # # We'll generate different datasets, fit logit models to them, # # and plot the corresponding curves. # ########################################################################################## rm(list=ls(all=TRUE)) library(mvtnorm) invlogit <- function(x){ 1 / (1 + exp(-x)) }#setting up a function that will help us generate data ## so, let's generate some data with some parameters: x <- runif(100, -6, 6) y <- rbinom (100, 1, invlogit(x)) pdf(file="Figs/logitXB.pdf")#this starts making a pdf of the figure plot(x,y, col="white", main="Logit Curves") #this just sets up the plot; we'll add curves later ## now let's fit a logit to that, and plot the curve. logit.R <- glm (y ~ x, family=binomial(link="logit")) curve (invlogit (logit.R$coef[1] + logit.R$coef[2]*x),add=TRUE) ## if we do this with different values of x, we get different shapes: x1 <- runif(100, -5, 5) y1 <- rbinom (100, 1, invlogit(x1)) logit.1 <- glm (y1 ~ x1, family=binomial(link="logit")) curve (invlogit (logit.1$coef[1] + logit.1$coef[2]*x),add=TRUE) x2 <- runif(100, 0, 10) y2 <- rbinom (100, 1, invlogit(x2)) logit.2 <- glm (y2 ~ x2, family=binomial(link="logit")) #Note this throws an error to make a point. curve (invlogit (logit.2$coef[1] + logit.2$coef[2]*x),add=TRUE) dev.off() #end pdf ## we can change x around, and see how this changes the curve. #### Contributors: Ariel White, Ben Schneer, Michael Gill #### April 20, 2011 #### Generate data for probit via a latent variable #### Single-Equation Models: pg. 9 ################################################################################################# # Show how to generate binary data that could be modeled as a probit using y* as a # # continuous variable: # # Like the logit model, the probit model is used with binary outcome variables. In this example,# # our task is to simulate y* as a function of some covariate, then dichotomize it according to # # that story, and show that probit recovers the answer. # # # # Imagine we had a dataset of 10000 random individuals with information about their height, # # whether they are of high or low income, but their true income is onobserved (y*). To show the # # latent variable recovery of the probit, consider the following: # # We generate a y* using simulated data with some covariates beta and some X's: # ################################################################################################# rm(list=ls(all=TRUE)) N <- 10000 set.seed(02138) height <- runif(N, 5.5, 6.25) #Sample of 10000 peoples' heights hist(height, xlab="Height (in feet)", main="Frequency of Heights of Individuals in Sample (N=10000)") set.seed(02138) epsilon <- rnorm(N, 0, 10000) y.star <- 10000*height + epsilon #A person's income as a function of their height hist(y.star) highincome <- ifelse(y.star>60000,1,0) ## Here we set a cutoff point according to if their income is higher than 60000 dollars a year. ## Our merged dataset: data <- as.data.frame(cbind(y.star, highincome, height)) head(data) probit.reg <- glm(highincome ~ height, data=data, family=binomial(link="probit")) summary(probit.reg) ## Note: We see the intercept returned is proportionate to the cut-off point we ## set for high-income earners; ## We also see that the coefficient on hight is proportionate to the true coefficient on height used to ## generate y*. #### Contributors: Ariel White, Ben Schneer, Michael Gill #### April 20, 2011 #### Why we simulate from the multivariate normal #### Single-Equation Models: pg. 18 ############################################################################ # Suppose we ran a bivariate regression of y on x1 and x2 (w/ no constant) # # and recovered the estimates beta # # with covariance matrix var, estimated variance sigma2 # # and N = 200 # # We want to predict the outcome y when x1=1 and x2=1 # ############################################################################ rm(list=ls(all=TRUE)) library(MASS) beta<-c(5,-10) var<-matrix(c(10,.5,.5,4),nrow=2,ncol=2) sigma2<-10 N<-200 S<-sigma2*(N-3) ## Part A. Simulating y the proper way ## Draws from multivariate normal draws<-mvrnorm(1000,beta,sqrt(var)) sdraws<-sqrt(S/rchisq(1000, N-3, ncp=0)) X<-c(1,1) sims<-draws%*%X y<-sapply(sims,rnorm,n=1,sd=sdraws) ## Part B: Not using Multivariate normal draws<-cbind(rnorm(1000,beta[1],var[1,1]),rnorm(1000,beta[2],var[2,2])) sims<-draws%*%X y2<-sapply(sims,rnorm,n=1,sd=sdraws) ## Comparison of predicted distribution of y ## Notice the inflated variance of the draws from the wrong distribution ## because of the omission of the covariances pdf("Figs/SimulateMVNvsNorm.pdf",width=11,height=8.5) plot(density(y),col="red",xlim=c(-30,30),main="Draws from MV Normal vs. Normal") lines(density(y2),col="dodgerblue") legend("topright",legend=c("MVN","N"),lty=c(1,1),col=c("red","dodgerblue")) dev.off() #### Contributors: Aditya Dasgupta and Sean Ingham #### April 20, 2011 #### Simulation of Predicted Values and Expected Values #### Single-Equation Models: pg. 20 ############################################################################ # This example will use data from Besley, Persson and # # Sturm (2010), "Political Competition, Policy and Growth", which # # examines the effect of political competition on economic # # policies in US states over time. # ############################################################################ rm(list=ls(all=TRUE)) ## We will also need to simulate random draws from a multivariate ## normal distribution library(mvtnorm) ## Load Data load("Gov2001CodeLibrary.RData") data <- BesleyEtAl ## Let's replicate column 1 from table 2 of the article. This is an ## OLS regression of state tax revenue as a share of state income ## on political competition, with 'state and year fixed effects' (dummy ## variables for each state and for each year, save a year and a state ## reference category). Note that for the purposes of this example ## we will use the ordinary OLS standard errors, as opposed to the SEs ## adjusted for clustering, as in the article. lm1 <- lm(share_taxes_inc ~ compnorm + as.factor(state) + as.factor(year), data = data) lm1$coefficients[2]; sqrt(vcov(lm1)[2,2]) ## Note: the coefficient is the same as in the article, but the ## standard error is different because we have not adjusted for clustering! ## This is to keep the example simple. But keep in mind that the decision ## whether to to adjust standard errors or not impacts the predicted values ## and expected values, since standard error adjustments affect the ## variance-covariance matrix, and thus also our estimation uncertainty. ## Simulate Predicted Values ## Step 1) Since we have a large sample size, we simulate the uncertainty ## about our coefficient estimates by randomly drawing from a multivariate normal ## distribution with a mean equal to a vector of the coefficients estimates and ## variance/covariance matrix equal to the variance-covariance matrix from our ## fitted model. set.seed(02138) ## Setting seed to ensure replicability simbetas <- rmvnorm(1000, mean=lm1$coefficients, sigma=vcov(lm1)) help(rmvnorm) ## Step 2) We pick the explanatory variable values for which we want to generate ## predicted values of the outcome variable. Let's say we are interested in the ## predicted state tax share in Arizona in 1970, if political competition were ## quite low: -.3 (the domain of possible values is [-.5, 0]). Taking into account ## the intercept, we want to create a vector of explanatory variable values where ## the Arizona state dummy variable is 1, the 1970 dummy variable is 1, the ## political competition variable is -.3, and all other variables are 0. predCovs <- c( 1, -.3, 1, c(rep(0,times=length(4:68))), 1, c(rep(0,times=length(70:100))) ) ## (intercept,compnorm,arizona,....,1970,...) ## Step 3) For each vector of randomly drawn parameters, we generate a predicted ## value by multiplying it with the vector of explanatory variable values. Note ## that we also incorporate fundamental uncertainty for each predicted value ## by drawing from the stochastic component of our estimator - in the case of ## OLS, a normal distribution. set.seed(02138) sim <- apply( simbetas, 1, function(x) { rnorm( 1, mean=predCovs%*%x, sd=sd(residuals(lm1)) )} ) # We apply the rnorm(... # function by row, where # each row corresponds # to a vector of # coefficient estimates. ## Step 4) Finally, we plot the thousand predicted values we have simulated. plot(density(sim), main="Predicted Value of Tax Share", xlab="Tax Share",col="blue") abline(v=mean(sim),col="red") savePlot(filename ="Figs/BesleyPredicted.pdf", type="pdf") ## Simulate Expected Values ## To simulate expected values, we simply average over the fundamental uncertainty ## associated with predicted values to yield one expected value for m simulations ## of predicted values. We do this repeatedly in order to simulate the distribution ## of expected values. set.seed(02138) expectedsim <- apply( simbetas, 1, function(x) { mean(rnorm( 1000, mean=predCovs%*%x, sd=sd(residuals(lm1))) )} ) ## This is the resulting simulated distribution: plot(density(expectedsim), main="Expected Value of Tax Share", xlab="Tax Share",col="green") abline(v=mean(expectedsim),col="yellow") savePlot(filename ="Figs/BesleyExpected.pdf", type="pdf") ## Comparing Predicted and Expected Values ## Compare the simulated distributions of predicted and expected values: plot(density(sim), main="Predicted and Expected Value of Tax Share", xlab="Tax Share",col="blue",ylim=c(0,3)) abline(v=mean(sim),col="red") points(density(expectedsim), type='l',col="green") abline(v=mean(expectedsim),col="yellow") savePlot(filename ="Figs/BesleyCompare.pdf", type="pdf") ## Things to note: ## The predicted values are more widely dispersed, because, as we have seen, ## they incorporate both fundamental and estimation uncertainty, while ## expected values incorporate only estimation uncertainty and average over ## fundamental uncertainty. ## The mean of both distributions is in expectation usually the same. ## This is because both incorporate draws from the fixed/systematic ## component of Y. ## Which one matters depends on what question you are asking! If you ## are interested in population average properties, you would be more ## interested in the distribution of expected values. If you are ## interested in causal effects, you would probably be interested in first ## differences in the distribution of expected values. If you are ## interested in making a one-off prediction, then you are better off ## looking at the distribution of predicted values. ## As a study's sample size approaches infinity, the distribution of ## expected values collapses to a spike, due to decreased estimation ## uncertainty. But the variance of predicted values does not ## approach 0, since there is always fundamental uncertainty, which ## has nothing to with how much data you have. This can be demonstrated ## by 'lying' to the computer and creating a much larger dataset by ## randomly sampling from our current dataset with replacement. (Note: ## we don't actually have any more information statistically, but we are ## approximating an imaginary situation where we did). ## Below is a function that creates a fake data set; you can choose the ## size. It repeats the procedure to simulate predicted and expected values ## of tax share, for explanatory variable values that correspond to Arizona ## in 1970, with political competition at -.3. fake <- function(size){ set.seed(02138) fakedata <- data[sample(1:nrow(data), size, replace = TRUE),] lmfake <- lm(share_taxes_inc ~ compnorm + as.factor(state) + as.factor(year), data = fakedata) fakesimbetas <- rmvnorm(1000, mean=lmfake$coefficients, sigma=vcov(lmfake)) fakepredCovs <- c( 1, -.3, 1, c(rep(0,times=length(4:68))), 1, c(rep(0,times=length(70:100))) ) fakesim <- apply( fakesimbetas, 1, function(x) { rnorm( 1, mean=fakepredCovs%*%x, sd=sd(residuals(lmfake)) )} ) fakeexpectedsim <- apply( fakesimbetas, 1, function(x) { mean(rnorm( 1000, mean=fakepredCovs%*%x, sd=sd(residuals(lmfake))) )} ) plot(density(fakesim), main=as.character(size), xlab="Tax Share",col="blue",ylim=c(0,7)) abline(v=mean(fakesim),col="red") points(density(fakeexpectedsim), type='l',col="green") abline(v=mean(fakeexpectedsim),col="yellow") } ## Note that as this imaginary dataset gets bigger, the expected ## value distribution collapses but the predicted value distribution ## does not! par(mfrow=c(2,2)) fake(1000) fake(4000) fake(16000) fake(64000) savePlot(filename ="Figs/ExpectedPredictedCollapse.pdf", type="pdf") dev.off() #### Contributors: Ana Catalano, Erru Yang, Yanfang Su #### April 20, 2011 #### Example of Simulating First Differences #### Single-Equation Models: pg. 24 ############################################################################ # This code shows how to simulate first differences in two # # different ways, manually and using Zelig. # ############################################################################ rm(list=ls(all=TRUE)) ## First creating a dataset to work with: female<-c(1, 0, 1, 0, 0, 0, 1, 1, 0, 0) educ<-c(2, 1, 2, 2, 3, 3, 1, 2, 2, 1) job<-c(1, 1, 1, 0, 0, 1, 1, 0, 1, 0) data<-as.data.frame(cbind(female, educ, job)) X <- cbind(1, female, educ) #intercept and covariates y<- job #dependent variable ## Running a probit model to predict job library(Zelig) out<-zelig(factor(job)~female + educ, model = "probit", data = data) betas<-out$coef varcov<-vcov(out) ## Simulating coefficients from a normal distribution library(mvtnorm) library(pscl) set.seed(02138) #remember to set the seed to replicate later simbetas <- rmvnorm(10000,betas, varcov) head(simbetas) ## Setting the covariates to their means meanbetas <- c(apply(X, 2, FUN=mean)) ## Setting the variable of interest to the values you care about ## (for continuous or ordinal variables like education this may be a min and max) means.female<-meanbetas means.female[2]<-1 #[2] because female comes second in your X means.male<-meanbetas means.male[2]<-0 first.diffs<-pnorm(means.female%*%t(simbetas)) - pnorm(means.male%*%t(simbetas)) mean(first.diffs) #difference in expected values quantile(first.diffs, .025); quantile(first.diffs, .975) #confidence interval ##Example of Zelig doing this for you x.low <- setx(out, female = 0) x.high <- setx(out, female = 1) s.out <- sim(out, x = x.low, x1 = x.high) summary(s.out) #and you should get the same thing as the longer method, above. plot(s.out) #### Contributors: Leslie Finger and Adela Soliz #### April 20, 2011 #### Reparameterizing on an Unbounded Scale and First Differences #### Single-Equation Models: Pg. 25 ###################################################################################### # This code shows that by reparameterizing pi in the logistic model on an unbounded # # scale, so that pi=1/(1+exp(-Xbeta)) we are able to maximize the parameters more # # easily. To sum up the steps, we first subset the data, then we write and optimize # # functions for the logistic model first on an unbounded scale and then on a bounded # # scale. Finally, we simulate first differences for certain interesting populations.# ###################################################################################### rm(list=ls(all=TRUE)) library(MASS) library(Zelig) ## To demonstrate the use of reparameterizing on an unbounded scale, ## let's look at how welfare determines whether kids go to school (scbase) using a logit model. load("Gov2001CodeLibrary.RData") dat <- kids head(dat) ## Subsets the data so it only includes data from 1998. ## This data set also includes 1999 observations, so this deletes any kids in the data set for ## both years. This data looks at a conditional cash transfer intervention starting in 98; by ## looking at scbase, we are only looking at school in 97, prior to the conditional cash transfer ## intervention. dat98 <- dat[dat$year==98,] unique(dat98$year) ## Subsets the data to include just the variables we care about (the outcome, scbase, the explanatory variable, ## welfare, and the controls: child's gender, indigenous status, grade, the head of household's education, sex, ## age, the number of family members, and the distance from secondary school). dat1 <- subset(dat98, select=c(scbase, welfare, gender, indig, hohedu, hohage, hohsex, fam_n, dist_sec), drop=TRUE) head(dat1) ## This drops the na's from the dataset. Ordinarily we would want to impute this missing data, ## but for the purpose of this exercise, we took this shortcut. dat2<-na.omit(dat1) dim(dat2) head(dat2) ## Functions and Optimization ## There are two ways to find the parameter values- using your own function or using Zelig. ## Makes a function of the log likelihood of the logistic model. logit.ll<-function(par, y, X){ betas<-par X<-as.matrix(X) ll<-sum((-y*log(1+exp(-X %*% betas)))+((1-y)*log(1-1/(1+exp(-X %*% betas))))) return(ll) } ## Defines the variables in the function: y is the first column of the data matrix, ## Xs are all the columns but the first one, and X is a matrix of the data plus a column of 1s ## at the beginning for the intercept. head(dat2) Xs<-as.matrix(dat2[,-1]) y<-as.vector(dat2$scbase) X <- cbind(1,Xs) head(X) ## Optimizes the function to find the parameter values. Our Zelig results are slightly different ## from those produced by optim. We believe this is because our function is badly behaved. par <-rep(0,9) opt.log<-optim(par, y=y, X=X, fn=logit.ll, method="BFGS", control=list(fnscale=-1), hessian=TRUE) ## Or you could optimize using zelig. results <- zelig(scbase ~ welfare + gender + indig + hohedu + hohage + hohsex + fam_n + dist_sec, data=dat2, model = "logit") results$coef summary(results) vcov(results) ## What if we do not put the model on an unbounded scale, but rather set pi=(-X %*% beta)? ## Then our function becomes: logit.llb<-function(par, y, X){ betas<-par X<-as.matrix(X) llb<-sum(-y*(-X %*% betas)+(1-y)*(1-(-X %*% betas))) return(llb) } ## And we optimize using the L-BFGS-B method which allows us to set upper and lower bounds. ## This is necessary because the parameters in the bernoulli model has to be between 0 and 1. opt.logb<-optim(par=rep(0,10), y=y, X=X, fn=logit.llb, method="L-BFGS-B", lower=rep(0,10), upper=rep(1,10), control=list(fnscale=-1), hessian=TRUE) ## Putting pi on an bounded scale causes problems when we try to optimize. ## That's why we put it on an unbounded scale. ## Simulations and First Differences ## Let's simulate the difference between the probability of school attendance changing ## the gender variable for boys and girls (Male = 1/Fem = 0) and setting all other variables ## at their medians. set.seed(012345) #Set the seed so you can reproduce the results. draws <- mvrnorm(10000, mu=results$coef, Sigma=vcov(results)) #Draw from a distribution defined by your optimization results. head(draws) nrow(draws) meds <- apply(dat1[,-1], MARGIN=2, FUN=median) #Sets all variables except the outcome to their median values. meds medians <- c(1, meds) # Adds a 1 at the beginning of the vector of median values to account for the intercept. ## Makes a vector of all variables at their median values, except for gender which is set to 0 ## for a girl and 1 for a boy. boys <- medians girls <- medians girls["gender"] <- 0 girls ## Calculates a simulated value of pi for all variables set to their medians except gender. pi.boys <- 1/(1+ exp(-boys %*% t(draws))) pi.girls <- 1/(1+ exp(-girls %*% t(draws))) firstdif <- pi.boys- pi.girls mean(firstdif) ## Boys have a higher probability of attending school. This may be due to childbirth etc. quantile(firstdif, .025); quantile(firstdif,.975) ## Finds quantiles by gender, to gauge certainty. quantile(pi.boys, .025); quantile(pi.boys,.975) quantile(pi.girls, .025); quantile(pi.girls,.975) ## We also calculated the expected value of pi for the whole model. ## This is not as useful as the first differences, but it gives us some idea of the probability of school ## enrollment for the average person in the dataset. pi.model<-1/(1+exp(-medians%*%t(draws))) mean(pi.model) quantile(pi.model, .025); quantile(pi.model, .975) ## In sum, reparameterizing pi to an unbounded scale facilitates the optimization ## and allows us to simulate quantities of interest. #### Contributors: Chilton, Crouch, Lavie #### April 20, 2011 #### Graph on Confidence Intervals #### Single-Equation Models: Pg. 28 ###################################################################################### # The goal: create a graph that looks similar to the one on p. 28 to the # # single equation slides with different data # # # # the data is taken from # # "Ethnicity, Insurgency and Civil War," by James Fearon and David Laitin. # # The article uses a number of variables to predict the onset of civil conflict. # # # ###################################################################################### rm(list=ls(all=TRUE)) library(Zelig) library (MASS) ## load the data from the relevant library ## removing unnecessary variables and cleaning ## the data load("Gov2001CodeLibrary.RData") vars.m1 <- c("onset","warl","gdpenl","lpopl1","lmtnest", "ncontig", "Oil", "nwstate","instab","polity2l","ethfrac","relfrac") fearonclean<-drop(fearon[vars.m1]) head(fearonclean) fearonclean<-na.omit(fearonclean) fearonclean["onset"]<-replace(fearonclean["onset"], fearonclean["onset"]==4,1) ## running the basic logit regression output<- zelig(onset ~ warl + gdpenl + lpopl1 + lmtnest + ncontig + Oil + nwstate + instab + polity2l+ ethfrac + relfrac, data = fearonclean, model = "logit") ## simulating the betas coef<-output$coefficients vcov<-summary(output)$cov.unscaled set.seed(02108) betas <- mvrnorm(n = 1000, mu = coef, Sigma = vcov) ## betas, then, is a simulated matrix with possible values of the coefficients ## now turning on to create simulated data, in this case the median data fearonclean1<-as.matrix(fearonclean) data<-c(rep(0,ncol(fearonclean1))) names(data)<-c(colnames(fearonclean1)) for (i in 1:length(data)){ data[i]<-median(fearonclean1[,i]) } data[1]<-1 names(data)[1]<-"Intercept" ## first differences- a country with 1 on nwstate (new states) parameter datanwst<-data datanwst["nwstate"]<-1 gdpseq <- seq(.25,7,.25) ## creating a matrix of non-new state countries where ## the difference is the changing GDP datamat <- matrix(rep(data), nrow = length(gdpseq), ncol = length(data), byrow = TRUE) datamat[,3] <- gdpseq ## and a parallel matrix for nwstate=1 countries datamatnwst <- matrix(rep(datanwst), nrow = length(gdpseq), ncol = length(datanwst), byrow = TRUE) datamatnwst[,3] <- gdpseq colnames(datamat)<-c(names(data)) colnames(datamatnwst)<-c(names(datanwst)) ## simulating the expected values holder<-holdernwst<-matrix(NA, nrow=nrow(datamatnwst), ncol=3, byrow=TRUE) temp<-c(rep(0,nrow(betas))) for (i in 1:nrow(datamat)){ temp <-1/(1+exp(-datamat[i,]%*%t(betas))) holder[i,1] <-mean(temp) ## the confidence intervals are the 2.5% "tails" holder[i,2]<-sort(temp)[25] holder[i,3]<-sort(temp)[975] ## same for the new states temp <-1/(1+exp(-datamatnwst[i,]%*%t(betas))) holdernwst[i,1] <-mean(temp) holdernwst[i,2]<-sort(temp)[25] holdernwst[i,3]<-sort(temp)[975] } pdf("Figs/ConfidenceIntervals.pdf") plot(gdpseq, holder[,1], type = "l", ylim = c(0,.25), lwd = 2, col = "darkblue", xlab = "GDP per capita (thousands of dollars)", ylab = "Prob of Civil War Onset", cex = .1) ##creating the vertical confidence intervals for (i in 1:length(gdpseq)){ segments(gdpseq[i], holder[i,2],gdpseq[i], holder[i,3],col = "darkblue", lwd=1.2) } ##now same for nwstates lines(gdpseq, holdernwst[,1], col = "darkorange3", lwd = 2) for (i in 1:length(gdpseq)){ segments(gdpseq[i], holdernwst[i,2],gdpseq[i], holdernwst[i,3],col = "darkorange", lwd=1.2) } legend("topleft", legend = c("Regular Countries","New States"), lwd = c(2,2), col = c("darkblue","darkorange3"), cex = .6) dev.off() #### Contributors: Nils Hagerdal and Dan Masterson #### April 20, 2011 #### Ternary Plots in R #### Single-Equation Models: Pg. 36 ###################################################################### # The idea is to create plots that show the distribution # # of some variable for a range of subjects, where the # # variable is split between 3 possible values that all # # sum up to a constant: a + b + c = K # # # # Eg, UK vote share: Tories + Labour + Liberals = 100% # # for each constituency. # # # # Here, we use the distribution of employment in European # # countries: employment shares in the primary, secondary # # and tertiary sectors for each country sum to 100%. # # We use the package "ade4" and the "euro123" data set, # # with code borrowed from: # # http://addictedtor.free.fr/graphiques/RGraphGallery.php?graph=34 # ###################################################################### rm(list=ls(all=TRUE)) library(ade4) data(euro123) summary(euro123) head(euro123) colnames(euro123$in78) <- c("Primary", "Secondary", "Tertiary") ## triangle.plot() is the main command pdf("Figs/TernaryPlot1.pdf") triangle.plot(euro123$in78, clab = 0, cpoi = 0.9, addmean = TRUE, show = FALSE, sub = "Employment by sector, EU 1978", csub = 1, possub = "topleft") dev.off() pdf("Figs/TernaryPlot2.pdf") triangle.plot(euro123$in78, label = row.names(euro123$in78), clab = 0.5, show = FALSE, sub = "Employment by sector, EU 1978", csub = 1, possub = "topleft") dev.off() #### Contributors: Michael Hankinson & Jackelyn Hwang #### April 20, 2011 #### fit and simulate quantities of interest from a grouped uncorrelated binary variables model #### Single-Equation Models: Pg. 47 #################################################################################################### # In plain English: The Grouped Uncorrelated Binary Variable (GUBV) model is similar to a # # binary logit, but we are looking at i.i.d. (uncorrelated) groups (grouped) of Bernoulli # # trials (binary variables). In other words, we may only observe a total count of the binary # # outcomes, not the outcome of each individual event. # # # # For instance, Gary uses the example of the number of times you voted out of the last 5 # # elections. However, instead of observing each individual turnout, we receive the total number # # of turnouts (coded as "1") as a single value. Were to you to vote in 4 elections, our outcome # # would be "4". Yet, we have no idea specifically in which of the past five elections you voted, # # but simply that you voted in four of them. # # # # How to Simulate Quantities of Interest - (raw instructions as drawn from G. King slides). # # 1) Run optim and get beta-hat and the variance matrix # # 2) Draw many values of beta-tilde from the multivariate normal with mean vector beta-tilde and # # the variance matrix that come from optim. # # 3) Set X to your choice of Values, X-sub"c" # # 4) Calculate simulations of the probability that any of the component binary vairables is a one. # # 5) If pi is of interest, summarize with mean, SD, CI's or histogram as needed. # # 6) If simulations of y are needed, go one more step and draw y-tilde from Binomial(y|pi). # #################################################################################################### rm(list=ls(all=TRUE)) ## First, let's build some data to play with! ## Generate a true dataset of n=500 observations with 6 variables: Y, X1, X2, X3, X4, X5. ## The Xs will be drawn from a multivariate normal with means 0, variances 1, and the ## following correlation matrix ## Correlation matrix (1 -.12 -.1 .5 .1, -.12 1 .1 -.6 .1, -.1 .1 1 -.5 .1, .5 -.6 -.5 1 .1, ## .1 .1 .1 .1 1). library(MASS) set.seed(62788) sigma <- matrix(c(1,-.12,-.1,.5,.1,-.12,1,.1,-.6,.1,-.1,.1,1,-.5,.1,.5,-.6,-.5,1,.1,.1,.1,.1,.1,1),5,5) truedata <- mvrnorm(n=500,rep(0,5),sigma) colnames(truedata) <- c("X1","X2","X3","X4","X5") head(truedata) ## Now that we have our Xs, let's draw our Ys. Because we are looking for integers, we'll pull ## 5 draws from a binary distribution. These 5 draws will be summed into a single variable, our ## Y, representing the number of elections in which the individual voted. Ys<-rbinom(500,5,.5) Ys ## Great, now we cbind our Ys with our Xs. GUBVdata1<-cbind(Ys,truedata) head(GUBVdata1) ## Oh, and don't forget the intercept. intercept<-matrix(c(rep(1,500))) GUBVdata<-cbind(intercept,GUBVdata1) head(GUBVdata) ## Fantastic! ## Let's try to understand Gary's instructions. ## 1) Run (optim) and get beta-hat and the variance matrix ## First, we need the function. Below is the GUBV function as taken from Gary's slides. ## lnL(Beta|y) = sum(N-y)*log(1+exp(X %*% beta)-y*log(1+exp(-X %*% beta))) ## Because we are looking at 5 elections, our N will be 5, the maximum number of events in ## which one can participate. GUBV <- function(beta,y,X){ xbeta<- X%*% beta out<- sum((5-y)*(log(1+exp(xbeta))-y*log(1+exp(-xbeta)))) return(out) } ## Before running optim, we need to define our data components. X<-as.matrix(GUBVdata1[,-1]) y<-as.vector(GUBVdata1[,1]) ## And, now we'll run optim. results<-optim(par=rep(0.1,5), y=y, X=X, fn=GUBV, method="BFGS", control=list(fnscale=-1), hessian=TRUE) ## 2) Draw many values of beta-tilde from the multivariate normal with mean vector beta-tilde and ## the variance matrix that come from optim. ## To get the mean vector beta-tilde and the variance matrix, we need to pull apart the optim results. ## Varaince Matrix hessian <- as.matrix(results$hessian) varcov <- solve(-hessian) ## Mean Vector coef<-as.vector(unlist(results$par)) ## Now, we'll draw our "many values of beta-tilde" from the multivariate normal. How about 500? draws<-mvrnorm(n=500, mu=coef, varcov) ## 3) Set X to your choice of Values, X-sub"c" ## Let's create an individual based on the mean of each covariate. It may not occur in the real world, ## but it should fit within our data. MeanGUBV<-matrix(c(mean(GUBVdata1[,2]), mean(GUBVdata1[,3]), mean(GUBVdata1[,4]), mean(GUBVdata1[,5]), mean(GUBVdata1[,6])), nrow=1, ncol=5) ## 4) Calculate simulations of the probability that any of the component binary vairables is a one. ## Each of the 5 elections can register a 0 or 1, depending on whether our created individual decided to vote. ## We can apply our drawn values of beta-tilde to our created individual to calculate their probability of ## participating in any individual election. pi.one<- 1/(1+exp(-MeanGUBV %*% t(draws))) ## Now, we have 500 simulated probabilities of whether our created individual will turn out to vote. ## 5) If pi is of interest, summarize with mean, SD, CI's or histogram as needed. ## I love visuals. Let's run some visual diagnostics. ## Mean meanpi<-mean(pi.one) ## For ease, we'll make this a pretty vector. tpi.one<-t(pi.one) pi.one.vec<-as.vector(pi.one) ## Standard Deviation - always important standev<-sd(as.vector(pi.one)) ## Confidence Interval, with n=500 error<-qnorm(0.975)*standev/sqrt(500) left<-meanpi-error right<-meanpi+error left right ## Histogram, with Confidence Intervals pdf(file="Figs/GroupedUnc.pdf") hist(tpi.one, col="lavender", main="Histogram of Probabilities of One, With Confidence Interval", xlab="Probability of One") abline(v=left, col="red") abline(v=right, col="red") dev.off() ## Wow, looks like a tight CI. Alright! ## 6) If simulations of y are needed, go one more step and draw y-tilde from Binomial(y|pi). ## Let's take 1000 draws of y, based on our new probability of each binary component. ## This will help simulate the number of elections in which our created individual will participate. simy<-rbinom(1000, 5, meanpi) median(simy) ## Three elections. 3 out of 5. Better than the average American, right? ## Congratulations! You've mastered Grouped Uncorrelated Binary Variable Modeling. ## What a relief! Now, back to Facebook. #### Contributors: Frey and Walker #### April 20, 2011 #### Variance in Poisson Models #### Single-Equation Models: Pg. 55 ##################################################################################################### # We examine the variance assumptions of the Poisson count model. # ##################################################################################################### rm(list=ls(all=TRUE)) load("Gov2001CodeLibrary.RData") ## Overdispersion ## Loading the Data data<- Overdispersed names(data) ## Altering "Treat" data$treatf<-factor(data$treat) ## Generating a Poisson PoissonOD<-glm(count ~ treatf + prior + age, family=poisson, data=data) summary(PoissonOD) ## Calculating the Residuals of the Poisson Model resid<-residuals.glm(PoissonOD, type="deviance") fitted<-PoissonOD$fitted.values ## Calculating SSR: sum residuals^2/(#observations-#coefficients) sum(resid^2)/(59-4) ## Comparing to Chi-Squared with df=#observations-#coefficients 1-pchisq(sum(resid^2),55) ## Plotting the Residuals plot(fitted, resid, main="Residuals for Overdispersed Poisson Model", xlab="Fitted Values", ylab="Deviance Residuals", ylim=c(-4,4)) ## Correct Variance ## Generating Data var1<-factor(c('A','A','A','A','A','A','A','A','A','A','B','B','B','B','B','B','B','B','B','B')) var2<-factor(c('Y','N','Y','Y','Y','Y','N','Y','N','Y','N','Y','N','Y','Y','Y','Y','N','N','N')) var3<-c(10,5,9,4,6,14,2,3,8,10,11,15,4,5,7,9,12,6,5,8) var4<-c(12,15,13,5,8,17,10,6,22,14,20,18,11,11,12,13,28,10,14,10) data<-data.frame(Treat, var1, var2, var3) ## Generating Poisson PoissonCorrect<-glm(var3 ~ var1 + var2, family=poisson, data=data) summary(PoissonCorrect) ## Residuals and Fitted Values residC<-residuals.glm(PoissonCorrect, type="deviance") fittedC<-PoissonCorrect$fitted.values ## Calculating SSR: sum sq resid / #observations - #coefficients sum(residC^2)/(20-3) ## Chi-Squared 1-pchisq(sum(residC^2), 17) ## Plotting plot(fittedC, residC, main="Residuals for Overdispersed Poisson Model", xlab="Fitted Values", ylab="Deviance Residuals") ## NOTE: If you offset the data, the dispersion appears to be even smaller ## (the variance appears more correct) ## To see this, run the following: PoissonOffset<-glm(var3 ~ var1 + var2, family=poisson, data=data, offset=log(var4)) residOffset<-residuals.glm(PoissonOffset, type="deviance") fittedOffset<-PoissonOffset$fitted.values 1-pchisq(sum(residOffset^2), 17) plot(fittedOffset, residOffset, main="Residuals for Overdispersed Poisson Model", xlab="Fitted Values", ylab="Deviance Residuals") ## Underdispersed Poission probs<-prop.table(dbinom(0:10, 10, .5) + ifelse(0:10==5,1,0)) dataUD<-sample(0:10, 1000, pr=probs, repl=TRUE) obs<-c(1:1000) data.UD<-data.frame(obs=obs, dataUD=dataUD) ## Generating a Poisson PoissonUD<-glm(obs ~ dataUD, family=poisson, data=data.UD) summary(PoissonUD) ## Calculating the Residuals of the Poisson Model residUD<-residuals.glm(PoissonUD, type="deviance") fittedUD<-PoissonUD$fitted.values ## Calculating SSR: sum residuals^2/(#observations-#coefficients) sum(residUD^2)/(998) ## Comparing to Chi-Squared with df=#observations-#coefficients 1-pchisq(sum(residUD^2),998) ## Plotting the Residuals plot(fittedUD, residUD, main="Residuals for Overdispersed Poisson Model", xlab="Fitted Values", ylab="Deviance Residuals", ylim=c(-4,4)) #### Contributors: James Conran, Jeremy Ferwerda, Yue Hou #### April 20, 2011 #### Logit vs. Probit #### Single-Equation Models: No pg. # ##################################################################################################### # Show that the logit and probit do not offer substantively different quantities of interest # ##################################################################################################### rm(list=ls(all=TRUE)) ## We'll start by visualizing the probit and logit functions ## First, we will plot the probability densities using R's canned functions rnorm and rlogis ## Number of iterations n <- 10^6 ## Set up the plot space par(mfrow=c(1,2)) ## Graph densities plot(density(rnorm(n)),main="Density",xlab="x") # probit lines(density(rlogis(n)),col="blue") # logit legend("topleft",legend=c("Probit","Logit"),bty="n",cex=.7,lty=1,col=c("black","blue")) ## input values test <- seq(-5,5,by=.01) ## Second, let's plot the inverse link functions link.logit <- 1/(1+exp(-test)) link.probit <- pnorm(test) ## Graph inverse link functions plot(test,link.probit,type="l",xlim=c(-5,5),main="Inverse Link Function",xlab="XB",ylab="pi") #probit lines(test,link.logit,col="blue",type="l",xlim=c(-5,5)) #logit legend("topleft",legend=c("Probit","Logit"),bty="n",cex=.7,lty=1,col=c("black","blue")) savePlot(filename = "Figs/LogitProbit1.pdf", type="pdf") ## It should be clear that logit and probit are very similar, except at the tails of the distribution. ## As we will see in the next section, the difference in the tails does not result in substantially ## different estimates ## Concrete Example ## ## Use a default dataset that has a dichotomous variable data(mtcars) attach(mtcars) ## Let's test the probability that a car will have an automatic transmission (am), ## based upon its mpg rating and the number of cylinders. ## Create a matrix for the results and covariates y <- as.matrix(am) # results - automatic transmission X <- cbind(1, mpg, cyl) # covariates ## Define the log-likelihood functions. Although this can be done quickly using Zelig, ## these functions are included in order to show the log-likelihood of the inverse link functions. ll.probit <- function(beta, y=y, X=X){ xbeta = X %*% beta phi <- pnorm(xbeta, log = TRUE) phi2 <- pnorm(xbeta, log = TRUE, lower.tail = FALSE) result <- sum(y*phi + (1-y)*phi2) return(result) } ll.logit <- function(beta, y=y, X=X){ xbeta = X %*% beta result <- -sum(log(1+exp((1-(2*y))*xbeta))) return(result) } ## Use optim to find the results, inputting a vector of 0s as starting values result_probit <- optim(par = c(rep(0,ncol(X))), fn = ll.probit, y=y, X=X, control = list(fnscale = -1),method = "BFGS", hessian = TRUE) result_logit <- optim(par = c(rep(0,ncol(X))), fn = ll.logit, y=y, X=X, control = list(fnscale = -1),method = "BFGS", hessian = TRUE) ## Although the coefficients are different, the interpretation is not result_probit$par result_logit$par ## Calculate the probability of having an automatic transmission for each row in X probit_o <- pnorm(X %*% result_probit$par) logit_o <- 1/(1+exp(-X %*% result_logit$par)) ## Table of predictions - both methods produce similar estimates table <- cbind(probit_o,logit_o,am) colnames(table) <- c("Probit Pr", "Logit Pr", "Automatic") table ## Plot the resulting predictions par(mfrow=c(1,1)) plot(density(probit_o),main="Logit vs Probit",xlab="",ylab="") lines(density(logit_o),col="blue") legend("topleft",legend=c("Probit","Logit"),bty="n",cex=.7,lty=1,col=c("black","blue")) savePlot(filename = "Figs/LogitProbit2.pdf", type="pdf") ## As should be clear, the logit and probit result in very similar predictions ## Note that for a quick check, you can also follow Amemiya's (1981) and King's (1989) advice ## and multiply probit ceofficients by 1.6 in order to provide a rough comparison with the logit scale. result_probit$par * 1.6 result_logit$par #### Contributors: Sorapop Kiatpongsan and Slawa Rokicki #### April 20, 2011 #### Exponential Models #### Single-Equation Models: No pg. # ###################################################################################################### # We show how to estimate an exponential model in R and produce Hazard ratios and other quantities # # of interest (using Zelig) # # This code is to show how to fit the data into an exponential model in R and produce Hazard ratios # # and other quantities of interest. # # Data is from the course BIO 223 Applied Survival Analysis at Harvard School of Public Health. # # This is the data of patients with bladder tumor with 8 covariates. # # Group 1 = Control, 2 = Treatment # # futime = follow up time # # number = number of tumor # # size = size of largest tumor # # t1 = time at the first recurrence # # t2 = time at the second recurrence # # t3 = time at the third recurrence # # t4 = time at the forth recurrence # # Note that NA means that the data is censored. # # Using data "bladder tumor" # # Attached as txt file "bladder.tumor.noheader.txt" # ###################################################################################################### rm(list=ls(all=TRUE)) ## Read Data load("Gov2001CodeLibrary.RData") ## View the data bladder ## Look at the distributions of baseline covariates table(bladder$group) table(bladder$number) table(bladder$size) ## Now, using Zelig to estimate an exponential model library(Zelig) ## Set random seed number set.seed(02138) ## Use the exponential duration regression model if you have a dependent variable representing ## a duration (time until an event). The model assumes a constant hazard rate for all events. ## The dependent variable may be censored (for observations have not yet been completed when ## data were collected). Ref. Zelig manual ## Creating censoring indicator c = o if t1 is censored otherwise c = 1 ## "time" is time to first recurrence or end of follow-up; ## "c" = 1 if subject experienced a recurrence (not censored); c <- as.numeric(is.na(bladder$t1)==F) time <- pmin(bladder$futime, bladder$t1, na.rm=T) bladder <- cbind(bladder, c, time) head(bladder) ## first observation has 0 follow-up time, not used in any analysis, ## drop first as it gives errors bladder <- bladder[-1,] ## Now using Zelig to estimate and compare first recurrence time between treatment and control groups z.out <- zelig(Surv(time, c) ~ group, model = "exp", data = bladder) summary(z.out) ## Defining covariates for control and treated groups x.control <- setx(z.out, group = 1) x.treated <- setx(z.out, group = 2) ## To find a hazard ratio for an exponential model ## Note that the expected value = 1/hazard ## Then, to find hazard ratio is to find the ratio of the 1/expected value s.out.control <- sim(z.out, x = x.control) summary(s.out.control) ## E(Y/X) = 26.73509 ## Hazard = 1/26.73509 = 0.03740403 s.out.treated <- sim(z.out, x = x.treated) summary(s.out.treated) ## E(Y/X) = 45.64097 ## Hazard = 1/45.64097 = 0.02191014 ## Then, the hazard ratios of hazard of the control to the hazard of the ## treated group = 0.03740403/0.02191014 = 1.707156 ## Finding other quantity of interest: The difference between the time to first recurrence ## between the control and treated groups ## Simulation to find expected values and first differences s.out <- sim(z.out, x = x.control, x1 = x.treated) summary(s.out) plot(s.out) ## if you wanted to pull out the individual expected values: expected.treated<-mean(s.out.treated$qi$ev) expected.treated expected.control<-mean(s.out.control$qi$ev) expected.control hazard.treated<-1/expected.treated hazard.treated hazard.control<-1/expected.control hazard.control Hazard.Ratio<-hazard.control/hazard.treated Hazard.Ratio #### Contributors: Maxwell Palmer & Andrew Hall #### April 20, 2011 #### Weibull Models #### Single-Equation Models: No pg. # ############################################################################################### # Show how to estimate a weibull model in R and produce Hazard ratios and other quantities of # # interest (using Zelig) # # Note that the code and example here comes straight from # # http://cran.r-project.org/web/packages/Zelig/vignettes/weibull.pdf # # I merely add expository comments. # ############################################################################################### rm(list=ls(all=TRUE)) library(Zelig) ## Use sample dataset from King, Alt, Burns, and Laver (1990) data(coalition) ## Data description from original paper: ## "This data set contains survival data on government coalitions in parliamentary democracies ## (Belgium, Canada, Denmark, Finland, France, Iceland, Ireland, Israel, Italy, Netherlands, Norway, ## Portugal, Spain, Sweden, and the United Kingdom) for the period 1945-1987. For parsimony, country ## indicator variables are omitted in the sample data." ## We want to estimate a model in which survival times of coalitions follow a Weibull distribution. ## Our dependent variable is the observed survival time of a coalition. ## However, this is censored data because there is a known maximum duration ## governed by the country's election laws. These laws create what the authors call a ## "constitutional interelection period (CIEP)" which induces right-censoring on ## the duration observations. ## We can see how many right-censored observations we have: table(coalition$ciep12) ## When ciep12 is 1, the coalition survived to the twelve months before the CIEP ## and therefore the duration value is censored. The true duration could be ## larger than the observed. ## Therefore we need to modify our dependent variable using Surv() ## The call to Surv() creates the "survival object", which takes into account the ## censored nature of the data. ## You pass in your time variable (duration here) and your event variable (ciep12 here) ## and you get back a Surv object you can use for survival models. ## Then you run zelig with the Surv object as the dependent variable. ## Given the Weibull model, zelig will find MLE estimates for your explanatory variables ## fract and numst2. ## fract reflects the number/size of parties in the parliament, with higher values meaning ## more fractionalization. numst2 is a dummy, 1 if majority government, 0 if minority government. z.out <- zelig(Surv(duration, ciep12) ~ fract + numst2, model="weibull", data=coalition) ## Now we have our estimated model, so we can make simulations from it. ## For example, here we simulate to find the effect of being a majority government ## vs. being a minority ## government on the survival of the coalition. x.low <- setx(z.out, numst2=0) x.high <- setx(z.out, numst2=1) s.out <- sim(z.out, x=x.low, x1=x.high) summary(s.out) plot(s.out) #### Contributors: Colin Sullivan & Todd Kawakita #### April 20, 2011 #### Cox Proportional Hazards Models #### Single-Equation Models: No pg. # ############################################################################################### # This brief guide will demonstrate Cox proportional hazards model in Zelig. We will use # # data from Zelig and some fancy footwork to estimate a hazards model and produce quantities # # of interest. This draws heavily on examples in the Zelig manual (written by Patrick Lam # ############################################################################################### rm(list=ls(all=TRUE)) ## Load the Zelig Package library(Zelig) ## Load new data - ## We will be looking at a dataset in Zelig called "coalition2" - Coalition Dissolution in ## Parliamentary Democracies, Modified Version data(coalition2) ## Let's have a look at the data: head(coalition2) summary(coalition2) ## Variables: ## duration - The length of duration in months of the cabinet coalition ## ciep12 - Binary variabel - did the coalition conclude its constitutional interelection ## period, or maximum length of duration? ## invest - Binary - Is there a legal requirement for legislative investiture? ## fract - An index of the number and size of parties in parliament ## polar - Measures polarization - support for extremist parties ## numst2 - Binary - Is this a majority government? ## crisis - The length of crisis preceding government formation (in days) ## country - No, this is not a trick. Name of country ## So, how long do coalitions last? plot(density(coalition2$duration, bw = 1.5), xlim=c(0, 70), main = "Duration Density", xlab="Cabinet Duration in Months") abline(v=mean(coalition2$duration)) ## Estimating the model ## Let's start by looking at just one covariate: invest ## The syntax: z.out <- zelig(Surv(Y, C) ~ X1 + X2, model = "coxph", data = mydata) ## where Y is the survival measure (duration) and C is the censoring mechanism (ciep12), ## and X1 and X2 are your covariates ex1 <- zelig(Surv(duration, ciep12) ~ invest, model = "coxph", data = coalition2) summary(ex1) ## Note that if your censorship mechanism is missing, Zelig will assume all observations are censored! ex1.b <- zelig(Surv(duration) ~ invest, model = "coxph", data = coalition2) summary(ex1.b) ## Running the model with several covariates ex2 <- zelig(Surv(duration, ciep12) ~ invest + fract + polar + numst2 + crisis, model = "coxph", data = coalition2) summary(ex2) ## Generating quantities of interest ## Imagine you were interested in the impact of moving from the first to the third quartile ## of fractionalization. First, we find the quartiles. Next, we assign those values to ## the two sets of covariates, and hold all other covariates at their medians. summary(coalition2$fract) x.lowfrac <- setx(ex2, fract = 677) x.highfrac <- setx(ex2, fract = 788) ## Next, we SIMULATE!!! diff.frac <- sim(ex2, x = x.lowfrac, x1 = x.highfrac) plot(diff.frac) ## VARIATIONS ON A THEME ## Stratified Cox Proportional Hazards Model ## What's the difference? Baseline hazards vary, but coefficents are the same, across strata. ex3.strat <- zelig(Surv(duration, ciep12) ~ invest + fract + polar + numst2 + strata(crisis), model = "coxph", data = coalition2) x.lowstrat <- setx(ex3.strat, numst2 = 0, strata = "crisis=1") x.highstrat <- setx(ex3.strat, numst2 = 1, strata = "crisis=1") diff.strat <- sim(ex3.strat, x = x.lowstrat, x1 = x.highstrat) summary(diff.strat) ## Cox Proportional Hazards Model with Time-Varying Covariates ## What's the difference? Let's assume that right now, you are not wrestling a bear and ## you have some predicted survival duration. If you someday find yourself wrestling a bear, ## your predicted survival time might decrease... slightly. Zelig allows your model to ## include variables whose value changes for a given observation over time. ## Your data should be long - that is, you should have multiple observations for ## each case with time-varying variables. If you are using quarterly unemployment ## to predict how long your goldfish survive, you will have one "q_unempl" variable and ## one observation for every quarter that your golfish is still alive. ## The intervals are not required to be regular (quarterly) - they can be any length, and ## the start and stop times of each period must be defined for each observation. ## The function format is as follows: ## z.out <- zelig(Surv(start, stop, censoring) ~ X1 + X2 + ..., model="coxph", data=data) #### Contributors: Jeff Javed and Noam Gidron #### April 20, 2011 #### Zero-Inflated Negative Binomial Regression #### Single-Equation Models: No pg. # ################################################################################################# # We wish to show how to construct a zero-inflated negative binomial model from scratch. First, # # this code constructs a function for the log-likelihood of the zero-inflated negative binomial # # model. Next, it estimates the log-likelihood using a data set on fishing (Available at: # # http://www.ats.ucla.edu/stat/R/dae/fish.csv). Finally, it runs the same data through a canned # # zero-inflated negative binomial function from the pscl library to check for accuracy. # # # # Explanation of the Zero-Inflated Negative Binomial # # The zero-inflated negative binomial model is used for event counts where we are worried that # # we have overdispersion due to a large amount of structural zeroes in our dependent variable. # # Thus, if y=0, then there is a possibility pi that we will get a structural 0 and a possibility# # 1 - pi that the zero was drawn from a negative binomial distribution. Thus, the probability # # distribution for P(Y=y|lambda, theta, pi) is: # # (pi + (1-pi)*negbinomial(0|lambda,theta))^d * (1-pi)*negbinomial(y|lambda, theta)^(1-d) # # where d = 1 when y = 0, and d = 0 when y > 0. # # # # NB: This implementation may be a bit unstable at the gamma(theta) stage. A production-quality# # implementation would use the lgamma() function (as done in the pscl package). Here we use a # # simpler coding for pedagogical purposes. # # # ################################################################################################# rm(list=ls(all=TRUE)) ## Start a function that takes in parameters, a matrix of explanatory variables (X), ## a matrix for the zero-inflated variables (Z), and the dependent variable (y). ll.zibinom <- function(par, X, Z, y){ #Set beta, gamma, and theta. Beta will contain as many parameters as are in the X matrix, #while gamma will contain as many parameters as are in the Z matrix. Theta is a single parameter. beta <- par[1:ncol(X)] theta <- par[(ncol(X) + 1)] gamma <- par[(ncol(X) + 2):length(par)] #Set lambda. lambda <- exp(X%*%beta) #Set pi. pie <- 1/(1+exp(-Z%*%gamma)) #Create the probability density function for a negative binomial if y > 0. negbinom <- (gamma(theta+y)/((factorial(y))*gamma(theta)))*((lambda^(y))*(theta^(theta)))/ ((lambda + theta)^(theta+y)) #Create the probability density function for a negative binomial if y = 0. negbinom.zero <- (theta^(theta))/((lambda + theta)^(theta)) #Set d so that d = 1 when y = 0, and d = 0 when y > 0. #This will differentiate between instances where y = 0 and y > 0. d <- ifelse(y==0, 1, 0) #Write out the log-likelihood for the zero-inflated negative binomial function. sum((d)*log(pie + (1-pie)*(negbinom.zero)) + (1-d)*log((1-pie)*negbinom)) } ## Estimating the Model ## We use data about the probability that a group of people in a park caught a fish. ## The explanatory variables are the number of children in the group, the number of people in the group, ## and whether or not the group brought a camper to a park. The dependent variable is the number of ## fish caught. The data and example are from UCLA Academic Technology Services. ## Available at: http://www.ats.ucla.edu/stat/stata/dae/zinb.htm. ## Load and view data. load("Gov2001CodeLibrary.RData") data <- fish head(data) ## The fish count is our dependent variable y. y <- data$count ## The child and camper variables are the independent variables (X). We add a column of 1s ## so that we can calculate an intercept. X <- as.matrix(cbind(1, data$child, data$camper)) ## The zero-inflated variable is number of people in a group. Again, we add a column of 1s ## so that we can calculate an intercept. Z <- as.matrix(cbind(1, data$persons)) ## Optimize the function. Number of parameters will equal the number of columns in X and Z plus 1. out <- optim(par=rep(0.01, ncol(X) + ncol(Z) + 1), ll.zibinom, Z=Z, X=X,y=y, method="BFGS", control=list(fnscale=-1), hessian=TRUE) ## Extract parameters. out$par ## Extract log-likelihood. out$value ## Compare to the Canned Function ## Load the pscl library. library(pscl) ## Run the zero-inflated binomial model. Place the Z matrix variables after the |. zinb <- zeroinfl(count ~ child + camper | persons, dist = "negbin", EM=TRUE, data=data) ## Check and compare results. Note that zeroinfl provides the log(theta), while our function ## provides an unlogged theta. summary(zinb) out$par out$value