##################################################################
##################################################################
# Gov 2001/1002/E-2001 Section 12
# 4-23-14
# Soledad Artiz Prillaman
##################################################################
##################################################################

# Start from a clean slate: drop every (non-hidden) object in the
# current workspace before running the examples below
rm(list = ls())

library(Zelig)
library(systemfit)
library("ZeligChoice")

###################################################
###### Seemingly Unrelated Regression Models ######
###################################################

# Pull in the grunfeld data that ships with the Zelig packages and take a
# first look at its columns
data(grunfeld)
names(grunfeld)
summary(grunfeld)


# Seemingly Unrelated Regression, estimated with the systemfit package:
# one equation for GE investment (mu1) and one for Westinghouse
# investment (mu2). The two equations are stored in a named list first
# and then fitted together.
formula <- list(
  mu1 = Ige ~ Fge + Cge,
  mu2 = Iw ~ Fw + Cw
)
sur.out <- systemfit(formula = formula, method = "SUR", data = grunfeld)
sur.out

# Variance-covariance matrix of the fitted model
vcov.sur <- vcov(sur.out)
vcov.sur
# Looking at this matrix, we can see that there are dependencies in our
# parameters across the two models



#######################################
###### Multinomial Choice Models ######
#######################################

# The mexico data (from the Zelig package) has data on vote choice
# across three parties in Mexico
data(mexico)
names(mexico)
summary(mexico$vote88)


# Multinomial logit model of voting behavior, with age and gender as
# covariates. Vote choice is a categorical variable, so it is wrapped in
# as.factor() to get a matrix of y's.
ml.out <- zelig(
  as.factor(vote88) ~ age + female,
  model = "mlogit",
  data = mexico
)
ml.out$result

# Difference in expected vote choice between the youngest (x) and the
# oldest (x1) ages in the data
x.young <- setx(ml.out, age = min(mexico$age))
x.old <- setx(ml.out, age = max(mexico$age))
ml.sim <- sim(ml.out, x = x.young, x1 = x.old)
summary(ml.sim)

# Plot the simulation results
plot(ml.sim)



##########################
###### Missing Data ######
##########################

# Let's load the Amelia package and the africa data in this package
library(Amelia)
data(africa)
names(africa)
summary(africa) # we have a couple of missing values in gdp_pc and trade

# Look at the first row of the data. The expected console output is kept
# below as comments; left uncommented it is not valid R and stops the
# script from being parsed.
africa[1, ]
#   year      country gdp_pc  infl trade civlib population
# 1 1972 Burkina Faso    377 -2.92 29.69    0.5    5848380

# Let's run our imputation
# x specifies the data to be imputed
# all variables in x will be used as predictors in the imputation model
# cs and ts name the cross-section and time-series identifier variables
# logs specifies variables which need to be logged
# m specifies the number of imputed data sets
# a larger m makes the combined results less sensitive to any single
# unusual imputed data set
# set.seed() makes the random imputations reproducible
set.seed(1234)
a.out <- amelia(x = africa, cs = "country", ts = "year",
                logs = "gdp_pc", m = 5)

# If you wanted to save these 5 data sets, you could:
# (this writes files starting with "a.out" into the working directory)
write.amelia(obj = a.out, file.stem = "a.out")


# Now, we just want to estimate a basic regression model with our imputed data
# But we have 5 datasets!
# Zelig will combine them for us:
z.out.imp <- zelig(trade ~ log(population) + log(gdp_pc) + infl
                   + civlib, data = a.out$imputations, model = "ls")
summary(z.out.imp)

### Diagnostics

# Missingness map: shows us where in the data our missingness is
missmap(a.out)

# Plotting the amelia output compares empirical and imputed densities
plot(a.out)

# Time-series, cross-sectional plot with imputed values: trade in Cameroon
tscsPlot(a.out, cs = "Cameroon", var = "trade")

# Overimputation tests the imputation model for a specific variable by
# imagining that each observation is missing and generating some
# imputations for it to check performance
overimpute(a.out, var = "trade")


# disperse() starts the algorithm at some unlikely values to check that
# amelia has found a global rather than a local maximum of the likelihood
# of the complete data
disperse(a.out, m = 5, dims = 1)