################################################################## ################################################################## # Gov 2001/1002/E-2001 Section 1 # 1-29-14 # Soledad Artiz Prillaman ################################################################## ################################################################## # Clear our workspace rm(list=ls()) #################################### ############ Simulation ############ #################################### ###### # Simulating from distributions ## R has a bunch of canned functions which make this easy ## Say we want to look at the distribution of salaries ## and we assume that salaries are normally distributed ## with mean 40000 and standard deviation 10000 ## It may be hard to conceptualize this distribution, ## So we can sample from it using the rnorm() command ## n is the sample size salaries <- rnorm(n=1000000 , mean=40000 , sd=10000) ## Plot the distribution plot(density(salaries)) ## Now, what if we wanted to know the density of the distribution ## i.e. the height of the distribution ## when x = 20000 ## dnorm provides the PDF of the normal distribution dnorm(20000, mean=40000, sd=10000) ## Or maybe we want to know the probability that a salary is ## is less than 20000 ## pnorm provides the CDF of the normal distribution pnorm(20000, mean=40000, sd=10000) ## Or maybe we want to know what salary marks the 95% percentile ## qnorm provides the inverse CDF of the normal distribution qnorm(0.95, mean=40000, sd=10000) ## this tells us that under our assumed model, ## 95% of people earn a salary of less than roughly $56000 ## There are many other distributions which you can sample from in a similar way! #################################### ## EXPECTATION VIA SIMULATION # Ex- You are waiting for the red line. How long with it take # for the T to get to you? # X can be {1,2,3,4,...} minutes # X is a geometric random variable measuring the number of minutes # until the train gets to you (including the minute it gets to you) # Assume that the probability that the T comes in any given minute is # pi = .2 # Now say we want to know the expected value of X - i.e. the expected # amount of time you are going to wait for the train # We can use simulation to approximate this value # Set the seed so that you can replicate your results # doesn't matter what it is set to set.seed(02138) # draw 100000 observations from a geometric distribution draws <- rgeom(n = 100000, prob = .2) # and then find the mean of these to approximate the expected value mean(draws) # NOTE: For some versions of R, rgeom does not include the minute you arrive # i.e. rather than being the number of failures (minutes the train doesn't come) # and the first success it is the number of failures before the first success # So we have to add 1 to rgeom to get the correct distribution - each value it gives us # is the number of minutes the train did not show up and then we add 1 for the minute # it did show up set.seed(02138) # draw 100000 observations from a geometric distribution draws <- 1 + rgeom(n = 100000, prob = .2) # and then find the mean of these to approximate the expected value mean(draws) # What if instead you wanted to know the expected value of a function # Say E[sqrt(1 + X)] # We can calculate this by simulating X, running it through the function g(x) and finding the mean # We can do this because of LOTUS which says that E[g(X)] = \sum_x g(x)P(X=x) # 1. Draw X from a geometric set.seed(02138) draws <- 1 + rgeom(n = 100000, prob = .2) # 2. Calculate g(x) for each draw g.draws <- sqrt(1 + draws) # Find the mean of this mean(g.draws) ####### ## Ex - Lets assume we have the same question but we want to model time as continuous, # i.e. the train can arrive at any point, not just on exact minutes # X can be any value greater than 0 # X is an exponential random variable # Assume that the rate at which X occurs is .25, i.e. lambda = .25 # Again we want to find the expected value of X using simulation # Set the seed set.seed(2001) # draw 100000 observations from an exponential distribution draws <- rexp(n = 100000, rate = .25) # and then find the mean of these to approximate the expected value mean(draws) # Again we want E[sqrt(1 + X)] # 1. Draw X from an exponential set.seed(2001) draws <- rexp(n = 100000, rate = .25) # 2. Calculate g(x) for each draw g.draws <- sqrt(1 + draws) # Find the mean of this mean(g.draws) #################################### ## PROBABILITY VIA SIMULATION # Ex- We have an urn with five red balls and five green balls # What is the probability of drawing four balls of the same color? # 1. Construct our population urn <- c("G","G","G","G","G","R","R","R","R","R") urn # Or urn <- c(rep("red",5),rep("green",5)) urn # Or urn <- c(rep(1,5),rep(0,5)) urn # 2. Take one sample set.seed(1217) # sample 4 elements from the vector urn without replacement draw <- sample(x = urn, size = 4, replace = FALSE) draw # 3. Determine if a success or failure sum(draw) == 4 sum(draw) == 0 # specify that it is a success if the vector adds to 1 or 0 success <- (sum(draw) == 4) | (sum(draw) == 0) success # 4. Repeat with a for-loop set.seed(1217) # Specify the number of times you want to repeat sims <- 100000 # Create an empty vector to store your output success <- NULL # Repeat steps 2-3 over and over and store your results from each repetition # in the vector success for(i in 1:sims){ draw <- sample(x = urn, size = 4, replace = FALSE) success[i] <- sum(draw) == 4 | sum(draw) == 0 } head(success) # (4. Repeat with replicate) # Write a function that takes in a vector of draws (our sample of 4 balls) # and returns whether it was a success or not eval <- function(draw){ success <- sum(draw) == 4 | sum(draw) == 0 return(success) } set.seed(1217) sims <- 10000000 # Replicate teh function over and over and store in a new vector success <- replicate(sims, eval(sample(urn, 4, replace=F))) head(success) # Exactly the same as our for-loop! (Note: this is because we set it to the same seed) # 5. Determine proportion of success sum(success)/sims # Or mean(success) #################################### ############ FUNCTIONS ############# #################################### ######### QUIZ ########### ## Write a function that takes in a vector x and a vector y # and reports the r^2 from a regression of x on y # do not use the lm() command at all! # This function takes as its inputs a vector of observations for the single covariate x # and a vector of observations for the y # It outputs a single value of the r2 r2.function <- function(x,y){ # Find our estimate of b1 using the first normal equation b1 <- sum((x-mean(x))*y)/sum((x-mean(x))^2) # Find our estimate of b0 using the second normal equation b0 <- mean(y) - b1*mean(x) # using our estimates of b1 and b0 find our yhat yhat <- b0 + b1*x # use these in the formula for r2 r2 <- sum((yhat-mean(y))^2)/sum((y-mean(y))^2) return(r2) }