# Saving plots --------------------

# Start the .jpeg driver
jpeg("your_plot.jpeg")
# Make the plot
plot(x = 1:10, y = 1:10)
# Turn off the driver
dev.off()

# Start the .pdf driver
pdf("your_plot.pdf")
# Make the plot
plot(x = 1:10, y = 1:10)
# Turn off the driver
dev.off()

# Overfull \hbox --------------------

library(magrittr)
library(dplyr) # for tbl_df() and mutate()

# Original line (too long; runs past the margin when knitted)
x <- rnorm(100) %>% matrix(ncol = 4) %>% tbl_df() %>% mutate(V5 = V1 * V2)

# Bad job splitting line (breaks mid-expression, no indentation)
x <- rnorm(100) %>% matrix(ncol = 4) %>% tbl_df() %>% mutate(V5 = V1 *
V2)

# Good job splitting line #1 (break after a pipe, indent the continuation)
x <- rnorm(100) %>% matrix(ncol = 4) %>%
  tbl_df() %>% mutate(V5 = V1 * V2)

# Good job splitting line #2 (one pipe per line)
x <- rnorm(100) %>%
  matrix(ncol = 4) %>%
  tbl_df() %>%
  mutate(V5 = V1 * V2)

# Logical operators --------------------

# I'm not lying
T == TRUE
identical(T, TRUE)
# A nuance
TRUE == "TRUE"
identical(TRUE, "TRUE")
# Greater/less than
1 > 3
1 < 1
1 >= 3
1 <= 1
# Alphabetization
"Ed" < "Everyone" # :(
"A" < "B"
# NA is weird
NA == NA
NA > 3
NA == T
is.na(NA)
# Equals
T == F
(pi > 1) == T
# And
(3 > 2) & (2 > 3)
# Or
(3 > 2) | (2 > 3)
# Not (gives the opposite)
! T

# Optional arguments --------------------

b_ols <- function(data, y_var, X_vars, intercept = TRUE) {
  # Require the 'dplyr' package
  require(dplyr)
  # Create the y matrix
  y <- data %>%
    # Select y variable data from 'data'
    select_(.dots = y_var) %>%
    # Convert y_data to a matrix
    as.matrix()
  # Create the X matrix
  X <- data %>%
    # Select X variable data from 'data'
    select_(.dots = X_vars)
  # If 'intercept' is TRUE, then add a column of ones
  # and move the column of ones to the front of the matrix
  if (intercept == T) {
    X <- data %>%
      # Add a column of ones to X_data
      mutate_("ones" = 1) %>%
      # Move the intercept column to the front
      select_("ones", .dots = X_vars)
  }
  # Convert X_data to a matrix
  X <- as.matrix(X)
  # Calculate beta hat
  beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
  # If 'intercept' is TRUE:
  # change the name of 'ones' to 'intercept'
  if (intercept == T) rownames(beta_hat) <- c("intercept", X_vars)
  # Return beta_hat
  return(beta_hat)
}

# Load the 'dplyr' package
library(dplyr)
# Set the seed
set.seed(12345)
# Set the sample size
n <- 100
# Generate the x and error data from N(0,1)
the_data <- tibble(
  x = rnorm(n),
  e = rnorm(n))
# Calculate y = 3 + 1.5 x + e
the_data <- mutate(the_data, y = 3 + 1.5 * x + e)
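
# Aside: a quick sanity check of b_ols() (a sketch, assuming 'the_data'
# from above). Base R's lm() should recover the same coefficients, and
# solve(a, b) with crossprod() solves the normal equations without
# explicitly inverting X'X, which is a bit friendlier numerically.
coef(lm(y ~ x, data = the_data))
X_chk <- cbind(1, the_data$x)
solve(crossprod(X_chk), crossprod(X_chk, the_data$y))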
# Plot to make sure things are going well.
plot(
  # The variables for the plot
  x = the_data$x, y = the_data$y,
  # Labels and title
  xlab = "x", ylab = "y", main = "Our generated data")

# Run b_ols with and without an intercept
b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = T)
b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = F)

library(lfe)
# felm with an intercept:
felm(y ~ x, data = the_data) %>% summary()
# felm without an intercept:
felm(y ~ x - 1, data = the_data) %>% summary()

# The estimates
b_w <- b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = T)
b_wo <- b_ols(data = the_data, y_var = "y", X_vars = "x", intercept = F)
# Plot the points
plot(
  # The variables for the plot
  x = the_data$x, y = the_data$y,
  # Labels and title
  xlab = "x", ylab = "y", main = "Our generated data")
# Plot the line from the 'with intercept' regression in light blue
abline(a = b_w[1], b = b_w[2], col = "lightblue", lwd = 3)
# Plot the line from the 'without intercept' regression in purple
abline(a = 0, b = b_wo[1], col = "purple4", lwd = 2)
# Add a legend
legend(x = min(the_data$x), y = max(the_data$y),
  legend = c("with intercept", "w/o intercept"),
  # Line widths
  lwd = c(3, 2),
  # Colors
  col = c("lightblue", "purple4"),
  # No box around the legend
  bty = "n")

# FWL --------------------

# Options
options(stringsAsFactors = F)
# Load the packages
library(dplyr)
library(lfe)
library(readr)
library(MASS)
# Set the working directory
setwd("/Users/edwardarubin/Dropbox/Teaching/ARE212/Section04/")
# Load the dataset from CSV
cars <- read_csv("auto.csv")

to_matrix <- function(the_df, vars) {
  # Create a matrix from variables in 'vars'
  new_mat <- the_df %>%
    # Select the columns given in 'vars'
    select_(.dots = vars) %>%
    # Convert to matrix
    as.matrix()
  # Return 'new_mat'
  return(new_mat)
}

b_ols <- function(data, y_var, X_vars, intercept = TRUE) {
  # Require the 'dplyr' package
  require(dplyr)
  # Create the y matrix
  y <- to_matrix(the_df = data, vars = y_var)
  # Create the X matrix
  X <- to_matrix(the_df = data, vars = X_vars)
  # If 'intercept' is TRUE, then add a column of ones
  if (intercept == T) {
    # Bind a column of ones to X
    X <- cbind(1, X)
    # Name the new column "intercept"
    colnames(X) <- c("intercept", X_vars)
  }
  # Calculate beta hat
  beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
  # Return beta_hat
  return(beta_hat)
}

b_ols(the_data, y_var = "y", X_vars = "x", intercept = T)
b_ols(the_data, y_var = "y", X_vars = "x", intercept = F)

resid_ols <- function(data, y_var, X_vars, intercept = TRUE) {
  # Require the 'dplyr' package
  require(dplyr)
  # Create the y matrix
  y <- to_matrix(the_df = data, vars = y_var)
  # Create the X matrix
  X <- to_matrix(the_df = data, vars = X_vars)
  # If 'intercept' is TRUE, then add a column of ones
  if (intercept == T) {
    # Bind a column of ones to X
    X <- cbind(1, X)
    # Name the new column "intercept"
    colnames(X) <- c("intercept", X_vars)
  }
  # Calculate the sample size, n
  n <- nrow(X)
  # Calculate the residuals via the annihilator matrix: (I - X(X'X)^-1 X') y
  resids <- (diag(n) - X %*% solve(t(X) %*% X) %*% t(X)) %*% y
  # Return 'resids'
  return(resids)
}

# Working through FWL --------------------

# Steps 1 and 2: Residualize 'price' on 'weight' and an intercept
e_yx <- resid_ols(data = cars, y_var = "price",
  X_vars = "weight", intercept = T)
# Steps 3 and 4: Residualize 'mpg' on 'weight' and an intercept
e_xx <- resid_ols(data = cars, y_var = "mpg",
  X_vars = "weight", intercept = T)
# Combine the two sets of residuals into a data.frame
e_df <- data.frame(e_yx = e_yx[,1], e_xx = e_xx[,1])
# Step 5: Regress e_yx on e_xx without an intercept
b_ols(data = e_df, y_var = "e_yx", X_vars = "e_xx", intercept = F)
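
# Aside: FWL also implies that the residuals from this final
# residual-on-residual regression equal the residuals from the full
# regression of price on an intercept, mpg, and weight. A quick check
# (a sketch, assuming resid_ols() and 'e_df' from above):
e_fwl <- resid_ols(data = e_df, y_var = "e_yx",
  X_vars = "e_xx", intercept = F)
e_full <- resid_ols(data = cars, y_var = "price",
  X_vars = c("mpg", "weight"), intercept = T)
# Should be TRUE (up to floating-point error)
all.equal(as.numeric(e_fwl), as.numeric(e_full))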
"e_xx", intercept = F) b_ols(data = cars, y_var = "price", X_vars = c("mpg", "weight")) # Orthogonal covariates -------------------- # Set the seed set.seed(12345) # Set the sample size n <- 1e5 # Generate x1, x2, and error from ind. N(0,1) the_data <- tibble( x1 = rnorm(n), x2 = rnorm(n), e = rnorm(n)) # Calculate y = 1.5 x1 + 3 x2 + e the_data <- mutate(the_data, y = 1.5 * x1 + 3 * x2 + e) # Regression omitting 'x2' b_ols(the_data, y_var = "y", X_vars = "x1", intercept = F) # Regression including 'x2' b_ols(the_data, y_var = "y", X_vars = c("x1", "x2"), intercept = F) # Orthogonal covariates -------------------- # Load the MASS package library(MASS) # Create a var-covar matrix v_cov <- matrix(data = c(1, 0.95, 0.95, 1), nrow = 2) # Create the means vector means <- c(5, 10) # Define our sample size n <- 1e5 # Set the seed set.seed(12345) # Generate x1 and x2 X <- mvrnorm(n = n, mu = means, Sigma = v_cov, empirical = T) # Create a tibble for our data, add generate error from N(0,1) the_data <- tbl_df(X) %>% mutate(e = rnorm(n)) # Set the names names(the_data) <- c("x1", "x2", "e") # The data-generating process the_data <- the_data %>% mutate(y = 1 + 2 * x1 + 3 * x2 + e) # Regression 1: y on int, x1, and x2 b_ols(the_data, y_var = "y", X_vars = c("x1", "x2"), intercept = T) # Regression 2: y on int and x1 b_ols(the_data, y_var = "y", X_vars = "x1", intercept = T) # Regression 3: y on int and x2 b_ols(the_data, y_var = "y", X_vars = "x2", intercept = T) # Bad controls -------------------- # Create a var-covar matrix v_cov <- matrix(data = c(1, 1 - 1e-6, 1 - 1e-6, 1), nrow = 2) # Create the means vector means <- c(5, 5) # Define our sample size n <- 1e5 # Set the seed set.seed(12345) # Generate x1 and x2 X <- mvrnorm(n = n, mu = means, Sigma = v_cov, empirical = T) # Create a tibble for our data, add generate error from N(0,1) the_data <- tbl_df(X) %>% mutate(e = rnorm(n)) # Set the names names(the_data) <- c("x1", "x2", "e") # The data-generating process the_data <- the_data %>% mutate(y = 1 + 2 * x1 + e) # Regression 1: y on int and x1 (DGP) b_ols(the_data, y_var = "y", X_vars = "x1", intercept = T) # Regression 2: y on int, x1, and x2 b_ols(the_data, y_var = "y", X_vars = c("x1", "x2"), intercept = T) # Regression 3: y on int and x2 b_ols(the_data, y_var = "y", X_vars = "x2", intercept = T) # R2 family -------------------- # Start the function r2_ols <- function(data, y_var, X_vars) { # Create y and X matrices y <- to_matrix(data, vars = y_var) X <- to_matrix(data, vars = X_vars) # Add intercept column to X X <- cbind(1, X) # Find N and K (dimensions of X) N <- nrow(X) K <- ncol(X) # Calculate the OLS residuals e <- resid_ols(data, y_var, X_vars) # Calculate the y_star (demeaned y) y_star <- demeaner(N) %*% y # Calculate r-squared values r2_uc <- 1 - t(e) %*% e / (t(y) %*% y) r2 <- 1 - t(e) %*% e / (t(y_star) %*% y_star) r2_adj <- 1 - (N-1) / (N-K) * (1 - r2) # Return a vector of the r-squared measures return(c("r2_uc" = r2_uc, "r2" = r2, "r2_adj" = r2_adj)) } # Our r-squared function r2_ols(data = cars, y_var = "price", X_vars = c("mpg", "weight")) # Using 'felm' felm(price ~ mpg + weight, cars) %>% summary() # Overfitting -------------------- # Find the number of observations in cars n <- nrow(cars) # Fill an n-by-50 matrix with random numbers from N(0,1) set.seed(12345) rand_mat <- matrix(data = rnorm(n * 50, sd = 10), ncol = 50) # Convert rand_mat to tbl_df rand_df <- tbl_df(rand_mat) # Change names to x1 to x50 names(rand_df) <- paste0("x", 1:50) # Bind cars and rand_df; convert to 
# Overfitting --------------------

# Find the number of observations in cars
n <- nrow(cars)
# Fill an n-by-50 matrix with random numbers from N(0, 10^2)
set.seed(12345)
rand_mat <- matrix(data = rnorm(n * 50, sd = 10), ncol = 50)
# Convert rand_mat to tbl_df
rand_df <- tbl_df(rand_mat)
# Change the names to x1 through x50
names(rand_df) <- paste0("x", 1:50)
# Bind cars and rand_df; convert to tbl_df; call it 'cars_rand'
cars_rand <- cbind(cars, rand_df) %>% tbl_df()
# Drop 'rep78' (it has NAs) and 'make' (a character variable)
cars_rand <- dplyr::select(cars_rand, -rep78, -make)

result_df <- lapply(
  X = 2:ncol(cars_rand),
  FUN = function(k) {
    # Apply the 'r2_ols' function to columns 2:k
    r2_df <- r2_ols(
      data = cars_rand,
      y_var = "price",
      X_vars = names(cars_rand)[2:k]) %>%
      matrix(ncol = 3) %>%
      tbl_df()
    # Set names
    names(r2_df) <- c("r2_uc", "r2", "r2_adj")
    # Add a column for k
    r2_df$k <- k
    # Return r2_df
    return(r2_df)
  }) %>% bind_rows()

# Plot unadjusted r-squared
plot(x = result_df$k, y = result_df$r2,
  type = "l", lwd = 3, col = "lightblue",
  xlab = expression(paste("k: number of columns in ", bold("X"))),
  ylab = expression('R' ^ 2))
# Plot adjusted r-squared
lines(x = result_df$k, y = result_df$r2_adj,
  type = "l", lwd = 2, col = "purple4")
# The legend
legend(x = 0, y = max(result_df$r2),
  legend = c(expression('R' ^ 2), expression('Adjusted R' ^ 2)),
  # Line widths
  lwd = c(3, 2),
  # Colors
  col = c("lightblue", "purple4"),
  # No box around the legend
  bty = "n")
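
# Aside: to save this figure to disk, wrap the plotting calls in one of
# the graphics drivers from the top of these notes (a sketch; the file
# name here is arbitrary):
pdf("overfitting_r2.pdf")
plot(x = result_df$k, y = result_df$r2,
  type = "l", lwd = 3, col = "lightblue",
  xlab = expression(paste("k: number of columns in ", bold("X"))),
  ylab = expression('R' ^ 2))
lines(x = result_df$k, y = result_df$r2_adj, lwd = 2, col = "purple4")
legend(x = 0, y = max(result_df$r2),
  legend = c(expression('R' ^ 2), expression('Adjusted R' ^ 2)),
  lwd = c(3, 2), col = c("lightblue", "purple4"), bty = "n")
dev.off()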