This program utilizes causal inference methods to estimate the
impact of a bad event. Specifically, we estimate the 3-month impact of
the 2017 Equifax data breach on stock prices using a synthetic control
of competitor and market ETF prices.
Importantly, this skill is highly transferable to other domains. For
example, we could estimate the impact of expanding marketing efforts to
a new region/country in a non-experimental setting using other untreated
countries as a synthetic control.
install.packages("dplyr")
trying URL 'http://cran.rstudio.com/bin/macosx/big-sur-x86_64/contrib/4.4/dplyr_1.1.4.tgz'
Content type 'application/x-gzip' length 1606367 bytes (1.5 MB)
==================================================
downloaded 1.5 MB
The downloaded binary packages are in
/var/folders/rq/qpfnngq15wj18l5t6y2f0gdm0000gn/T//RtmpRTyaLc/downloaded_packages
install.packages("ggplot2")
trying URL 'http://cran.rstudio.com/bin/macosx/big-sur-x86_64/contrib/4.4/ggplot2_3.5.1.tgz'
Content type 'application/x-gzip' length 4975729 bytes (4.7 MB)
==================================================
downloaded 4.7 MB
The downloaded binary packages are in
/var/folders/rq/qpfnngq15wj18l5t6y2f0gdm0000gn/T//RtmpRTyaLc/downloaded_packages
install.packages("Synth")
trying URL 'http://cran.rstudio.com/bin/macosx/big-sur-x86_64/contrib/4.4/Synth_1.1-8.tgz'
Content type 'application/x-gzip' length 132221 bytes (129 KB)
==================================================
downloaded 129 KB
The downloaded binary packages are in
/var/folders/rq/qpfnngq15wj18l5t6y2f0gdm0000gn/T//RtmpRTyaLc/downloaded_packages
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(Synth)
##
## Synth Package: Implements Synthetic Control Methods.
## See https://web.stanford.edu/~jhain/synthpage.html for additional information.
library(ggplot2)
FUNCTIONS CALCULATE MEAN GAP BETWEEN OBSERVED VALUE AND SYNTHETIC
CONTROL FOR DEFINED PERIODS
evaluate_gaps <- function(dataprep.out, pre_start, pre_end, post_length) {
Y1 <- dataprep.out$Y1plot
Y0_weighted <- dataprep.out$Y0plot %*% synth.out$solution.w
# Calculate the gap
gap <- Y1 - Y0_weighted
# Combine them into a dataframe
result_df <- data.frame(
Y1 = as.vector(Y1),
Y0_weighted = as.vector(Y0_weighted),
gap = as.vector(gap)
)
result_df$index <- as.numeric(rownames(result_df))
if(nrow(result_df) == (post_end-pre_start+1)) {
post_start <- nrow(result_df) - post_length + 1
pre_df <- result_df[result_df$index < post_start, ]
post_df <- result_df[result_df$index >= post_start, ]
print(mean(pre_df$gap))
print(mean(post_df$gap))
}
else {
print("Inconsistent length of results")
}
}
PULL IN PANEL DATA ON STOCK PRICES FROM YAHOO FINANCE
prices <- read.csv('equifax_breach_prices.csv')
prices$date <- as.Date(prices$date)
efx <- prices %>% filter(ticker=='EFX')
ggplot(efx, aes(x = date, y = price, color = ticker)) +
geom_line() +
labs(title = "Stock Prices Over Time", x = "Date", y = "Price", color = "Ticker") +
theme_minimal()
ggplot(prices, aes(x = date, y = price, color = ticker)) +
geom_line(aes(size = ifelse(ticker == "EFX", "EFX", "Others"))) +
scale_size_manual(values = c("EFX" = 1.5, "Others" = 0.5), guide='none') +
labs(title = "Stock Prices Over Time", x = "Date", y = "Price", color = "Ticker") +
theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
PREPARE DATA FOR SYNTHETIC CONTROL: ASSIGN SEQUENTIAL IDs FOR TICKER
SYMBOLS AND TRADING DATES
unique_dates <- prices %>%
select(date) %>%
distinct() %>%
arrange(date) %>%
mutate(period = row_number())
unique_tickers <- prices %>%
select(ticker) %>%
distinct() %>%
mutate(tickerID = ifelse(ticker=='EFX', 1, NA)) %>%
arrange(tickerID, ticker) %>%
mutate(tickerID = ifelse(is.na(tickerID), row_number(), tickerID))
df <- prices %>%
left_join(unique_dates, by = 'date')
df <- df %>%
left_join(unique_tickers, by = 'ticker')
# Convert group_id to integer
df$tickerID <- as.integer(df$tickerID)
df$period <- as.integer(df$period)
other_columns <- setdiff(names(df), c("tickerID","period"))
df <- df[, c("tickerID","period", other_columns)]
df <- as.data.frame(df)
df <- df %>% filter(!is.na(returns))
DEFINE PARAMETERS AND ESTIMATE SYNTHETIC CONTROL (~22 TRADING DAYS
PER MONTH)
event_date <- 215
post_length <- 22*3
pre_length <- 22*3
pre_end <- event_date - 1
pre_start <- pre_end - pre_length
post_end <- pre_end + post_length
df1 <- df %>%
filter((period >= pre_start) & (period <= post_end))
dataprep.out <- dataprep(foo = df1,
time.predictors.prior = pre_start:pre_end,
predictors = c('price','returns'),
predictors.op = 'mean',
dependent = 'price', # dv
unit.variable = 'tickerID', #identifying unit numbers
unit.names.variable = 'ticker', #identifying unit names
time.variable = 'period', #time-periods
treatment.identifier = 1, #the treated case
controls.identifier = 2:8, #the control cases; all others except target
time.optimize.ssr = pre_start:pre_end, #the time-period over which to optimize
time.plot = pre_start:post_end) #the entire time period before/after the treatment
# Run the synthetic control analysis
synth.out <- synth(dataprep.out)
X1, X0, Z1, Z0 all come directly from dataprep object.
****************
searching for synthetic control unit
****************
****************
****************
MSPE (LOSS V): 1.715837
solution.v:
0.9075486 0.09245137
solution.w:
0.06414134 0.06721226 0.2217716 0.3705654 0.0999262 0.08313581 0.09324734
# Plot the results
path.plot(dataprep.res = dataprep.out, synth.res = synth.out, Ylab = "Outcome", Xlab = "Date")
abline(v = pre_end, col = "red", lwd = 2, lty = 2)
gaps.plot(dataprep.res = dataprep.out, synth.res = synth.out, Ylab = "Gap in Outcome", Xlab = "Date")
abline(v = pre_end, col = "red", lwd = 2, lty = 2)
print(dataprep.out$names.and.numbers)
evaluate_gaps(dataprep.out, pre_start, pre_end, post_length)
[1] 1.51598e-06
[1] -39.15297
PLACEBO TESTING: IF UNBIASED, RESULTS SHOULD BE MUCH CLOSER TO
ZERO
post_length <- 22*3
pre_length <- 22*3
post_end <- event_date - 1
pre_end <- post_end - post_length + 1
pre_start <- pre_end - pre_length
aa <- df %>%
filter((period >= pre_start) & (period <= post_end))
dataprep.out <- dataprep(foo = aa,
time.predictors.prior = pre_start:pre_end,
predictors = c('price','returns'),
predictors.op = 'mean',
dependent = 'price', # dv
unit.variable = 'tickerID', #identifying unit numbers
unit.names.variable = 'ticker', #identifying unit names
time.variable = 'period', #time-periods
treatment.identifier = 1, #the treated case
controls.identifier = 2:8, #the control cases; all others except target
time.optimize.ssr = pre_start:pre_end, #the time-period over which to optimize
time.plot = pre_start:post_end) #the entire time period before/after the treatment
# Run the synthetic control analysis
synth.out <- synth(dataprep.out)
X1, X0, Z1, Z0 all come directly from dataprep object.
****************
searching for synthetic control unit
****************
****************
****************
MSPE (LOSS V): 2.890948
solution.v:
1 0
solution.w:
0.08772161 0.083939 0.112727 0.4109511 0.08952511 0.1018565 0.1132797
# Plot the results
path.plot(dataprep.res = dataprep.out, synth.res = synth.out, Ylab = "Outcome", Xlab = "Date")
abline(v = pre_end, col = "red", lwd = 2, lty = 2)
gaps.plot(dataprep.res = dataprep.out, synth.res = synth.out, Ylab = "Gap in Outcome", Xlab = "Date")
abline(v = pre_end, col = "red", lwd = 2, lty = 2)
print(dataprep.out$names.and.numbers)
evaluate_gaps(dataprep.out, pre_start, pre_end, post_length)
[1] 0.02647449
[1] -0.1771741
---
title: "Equifax Breach Synthetic Control"
author: "Jonathan Hershaff"
date: "2024-06-01"
output: html_notebook
---


#### This program utilizes causal inference methods to estimate the impact of a bad event. Specifically, we estimate the 3-month impact of the 2017 Equifax data breach on stock prices using a synthetic control of competitor and market ETF prices. 

#### Importantly, this skill is highly transferable to other domains. For example, we could estimate the impact of expanding marketing efforts to a new region/country in a non-experimental setting using other untreated countries as a synthetic control. 


```{r install-packages}
install.packages("dplyr")
install.packages("ggplot2")
install.packages("Synth")
library(dplyr)
library(Synth)
library(ggplot2)
```

#### FUNCTIONS CALCULATE MEAN GAP BETWEEN OBSERVED VALUE AND SYNTHETIC CONTROL FOR DEFINED PERIODS

```{r functions}
evaluate_gaps <- function(dataprep.out, pre_start, pre_end, post_length) {
  Y1 <- dataprep.out$Y1plot
  Y0_weighted <- dataprep.out$Y0plot %*% synth.out$solution.w
  
  # Calculate the gap
  gap <- Y1 - Y0_weighted
  
  # Combine them into a dataframe
  result_df <- data.frame(
    Y1 = as.vector(Y1),
    Y0_weighted = as.vector(Y0_weighted),
    gap = as.vector(gap)
  )
  
  result_df$index <- as.numeric(rownames(result_df))
  
  if(nrow(result_df) == (post_end-pre_start+1)) {
    post_start <- nrow(result_df) - post_length + 1
    pre_df <- result_df[result_df$index < post_start, ]
    post_df <- result_df[result_df$index >= post_start, ]
    
    print(mean(pre_df$gap))
    print(mean(post_df$gap))
  } 
  else {
    print("Inconsistent length of results")
  }
}
```


#### PULL IN PANEL DATA ON STOCK PRICES FROM YAHOO FINANCE

```{r import-stock-price-panel}
prices <- read.csv('equifax_breach_prices.csv')
prices$date <- as.Date(prices$date)

efx <- prices %>% filter(ticker=='EFX')

ggplot(efx, aes(x = date, y = price, color = ticker)) +
  geom_line() +
  labs(title = "Stock Prices Over Time", x = "Date", y = "Price", color = "Ticker") +
  theme_minimal()


ggplot(prices, aes(x = date, y = price, color = ticker)) +
  geom_line(aes(size = ifelse(ticker == "EFX", "EFX", "Others"))) +
  scale_size_manual(values = c("EFX" = 1.5, "Others" = 0.5), guide='none') +
  labs(title = "Stock Prices Over Time", x = "Date", y = "Price", color = "Ticker") +
  theme_minimal()


```


#### PREPARE DATA FOR SYNTHETIC CONTROL: ASSIGN SEQUENTIAL IDs FOR TICKER SYMBOLS AND TRADING DATES

```{r unique-periodID-for-each-date}
unique_dates <- prices %>%
  select(date) %>%
  distinct() %>%
  arrange(date) %>%
  mutate(period = row_number())

```

```{r unique-tickerID-for-each-ticker}
unique_tickers <- prices %>%
  select(ticker) %>%
  distinct() %>%
  mutate(tickerID = ifelse(ticker=='EFX', 1, NA)) %>%
  arrange(tickerID, ticker) %>%
  mutate(tickerID = ifelse(is.na(tickerID), row_number(), tickerID))
```


```{r join-unique-IDs}

df <- prices %>%
  left_join(unique_dates, by = 'date')

df <- df %>%
  left_join(unique_tickers, by = 'ticker')

# Convert group_id to integer
df$tickerID <- as.integer(df$tickerID)
df$period <- as.integer(df$period)

other_columns <- setdiff(names(df), c("tickerID","period"))
df <- df[, c("tickerID","period", other_columns)]

df <- as.data.frame(df)
df <- df %>% filter(!is.na(returns))

```


#### DEFINE PARAMETERS AND ESTIMATE SYNTHETIC CONTROL (~22 TRADING DAYS PER MONTH)

``` {r Synth} 

event_date <- 215
post_length <- 22*3
pre_length <- 22*3
pre_end <- event_date - 1
pre_start <- pre_end - pre_length 
post_end <- pre_end + post_length 

df1 <- df %>% 
  filter((period >= pre_start) & (period <= post_end))

    dataprep.out <- dataprep(foo = df1,
     time.predictors.prior = pre_start:pre_end, 
     predictors = c('price','returns'),
     predictors.op = 'mean',
     dependent = 'price', # dv
     unit.variable = 'tickerID', #identifying unit numbers
     unit.names.variable = 'ticker', #identifying unit names
     time.variable = 'period', #time-periods
     treatment.identifier = 1, #the treated case
     controls.identifier = 2:8, #the control cases; all others except target
     time.optimize.ssr = pre_start:pre_end, #the time-period over which to optimize
     time.plot = pre_start:post_end) #the entire time period before/after the treatment

# Run the synthetic control analysis
synth.out <- synth(dataprep.out)

# Plot the results
path.plot(dataprep.res = dataprep.out, synth.res = synth.out, Ylab = "Outcome", Xlab = "Date")
abline(v = pre_end, col = "red", lwd = 2, lty = 2)  
gaps.plot(dataprep.res = dataprep.out, synth.res = synth.out, Ylab = "Gap in Outcome", Xlab = "Date")
abline(v = pre_end, col = "red", lwd = 2, lty = 2)  

print(dataprep.out$names.and.numbers)


evaluate_gaps(dataprep.out, pre_start, pre_end, post_length)


```

#### PLACEBO TESTING: IF UNBIASED, RESULTS SHOULD BE MUCH CLOSER TO ZERO

```{r AA-test}

post_length <- 22*3
pre_length <- 22*3
post_end <- event_date - 1 
pre_end <- post_end - post_length + 1
pre_start <- pre_end - pre_length 

aa <- df %>% 
  filter((period >= pre_start) & (period <= post_end))

    dataprep.out <- dataprep(foo = aa,
     time.predictors.prior = pre_start:pre_end, 
     predictors = c('price','returns'),
     predictors.op = 'mean',
     dependent = 'price', # dv
     unit.variable = 'tickerID', #identifying unit numbers
     unit.names.variable = 'ticker', #identifying unit names
     time.variable = 'period', #time-periods
     treatment.identifier = 1, #the treated case
     controls.identifier = 2:8, #the control cases; all others except target
     time.optimize.ssr = pre_start:pre_end, #the time-period over which to optimize
     time.plot = pre_start:post_end) #the entire time period before/after the treatment

# Run the synthetic control analysis
synth.out <- synth(dataprep.out)

# Plot the results
path.plot(dataprep.res = dataprep.out, synth.res = synth.out, Ylab = "Outcome", Xlab = "Date")
abline(v = pre_end, col = "red", lwd = 2, lty = 2)  
gaps.plot(dataprep.res = dataprep.out, synth.res = synth.out, Ylab = "Gap in Outcome", Xlab = "Date")
abline(v = pre_end, col = "red", lwd = 2, lty = 2)  

print(dataprep.out$names.and.numbers)

evaluate_gaps(dataprep.out, pre_start, pre_end, post_length)

```


