#!/usr/bin/env python
# coding: utf-8

# In[1]:


#%%
# pyscience imports
import os
import sys
import glob
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# graphics
from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use("seaborn-darkgrid")
sns.set(style="ticks", context="talk")
plt.style.use("dark_background") # dark bg plots

# %matplotlib inline
# run for jupyter notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


# In[2]:


get_ipython().run_line_magic('pwd', '')


# # Regressions 

# In[3]:


import urllib
import zipfile
import urllib.request


# In[4]:


dldat = False
if dldat:
    # Download data and unzip 
    urllib.request.urlretrieve('http://economics.mit.edu/files/397', 'asciiqob.zip')
    with zipfile.ZipFile('asciiqob.zip', "r") as z:
       z.extractall()


# In[3]:


# Read the data into a pandas dataframe
pums         = pd.read_csv('asciiqob.txt',
                           header           = None,
                           delim_whitespace = True)
pums.columns = ['lwklywge', 'educ', 'yob', 'qob', 'pob']
pums.head()


# In[6]:


model      = smf.ols('lwklywge ~ educ', data=pums)
print(model.fit().summary())
#%%
results    = model.fit()
educ_coef  = results.params[1]
intercept  = results.params[0]
#%%


# In[7]:


# Calculate means by educ attainment and predicted values
groupbyeduc = pums.groupby('educ')
educ_means  = groupbyeduc['lwklywge'].mean()
yhat        = pd.Series(intercept + educ_coef * educ_means.index.values,
                        index = educ_means.index.values)
#%%


# In[8]:


sumstats = educ_means.reset_index()
sumstats['intercept'] = 1
sumstats['yhat'] = results.predict(sumstats)
sumstats


# In[9]:


# Create plot
plt.figure()
educ_means.plot(style='-o')
yhat.plot()
plt.xlabel("Years of completed education")
plt.ylabel("Log weekly earnings, \$2003")
plt.suptitle("Education ~ Wage CEF and linear fit")
plt.legend().set_visible(False)


# In[10]:


ggplot(sumstats, aes(x = 'educ')) + geom_point(aes(y = 'lwklywge')) + geom_line(aes(y='yhat'), color = 'red') +\
    labs(x = 'Years of Completed Education', y = 'Log Weekly earnings, $2003', title = 'Education ~ Wage CEF and linear fit')


# # Propensity Score

# In[11]:


import patsy
from tabulate import tabulate
#%%


# In[12]:


#%%
# Download data
urllib.request.urlretrieve('http://economics.mit.edu/files/3828', 'nswre74.dta')
urllib.request.urlretrieve('http://economics.mit.edu/files/3824', 'cps1re74.dta')
urllib.request.urlretrieve('http://economics.mit.edu/files/3825', 'cps3re74.dta')
#%%
# Read the Stata files into Python
nswre74  = pd.read_stata("nswre74.dta")
cps1re74 = pd.read_stata("cps1re74.dta")
cps3re74 = pd.read_stata("cps3re74.dta")


# In[13]:


# nswre74.head()
ggplot(nswre74, aes(x = 're78')) + geom_histogram(bins=30)


# In[14]:


#%%
# Store list of variables for summary
summary_vars = ['age', 'ed', 'black', 'hisp', 'nodeg', 'married', 're74', 're75']

# Calculate propensity scores
# Create formula for probit
f = 'treat ~ ' + ' + '.join(['age', 'age2', 'ed', 'black', 'hisp', \
                             'nodeg', 'married', 're74', 're75'])
#%%


# In[15]:


# Run probit with CPS-1
y, X   = patsy.dmatrices(f, cps1re74, return_type = 'dataframe')
model  = sm.Probit(y, X).fit()
cps1re74['pscore'] = model.predict(X)


# In[16]:


# nswre74.head()
ggplot(cps1re74, aes(x = 'pscore')) + geom_histogram(bins=30) + geom_density()


# In[17]:


#%%
# Run probit with CPS-3
y, X   = patsy.dmatrices(f, cps3re74, return_type = 'dataframe')
model  = sm.Probit(y, X).fit()
cps3re74['pscore'] = model.predict(X)


# In[18]:


# nswre74.head()
ggplot(cps3re74, aes(x = 'pscore')) + geom_histogram(bins=30) + geom_density()


# In[19]:


#%%
# Create function to summarize data
def summarize(dataset, conditions):
  stats          = dataset[summary_vars][conditions].mean()
  stats['count'] = sum(conditions)
  return stats
#%%
# Summarize data
nswre74_treat_stats    = summarize(nswre74, nswre74.treat == 1)
nswre74_control_stats  = summarize(nswre74, nswre74.treat == 0)
cps1re74_control_stats = summarize(cps1re74, cps1re74.treat == 0)
cps3re74_control_stats = summarize(cps3re74, cps3re74.treat == 0)
cps1re74_ptrim_stats   = summarize(cps1re74, (cps1re74.treat == 0)   & \
                                             (cps1re74.pscore > 0.1) & \
                                             (cps1re74.pscore < 0.9))
cps3re74_ptrim_stats   = summarize(cps3re74, (cps3re74.treat == 0)   & \
                                             (cps3re74.pscore > 0.1) & \
                                             (cps3re74.pscore < 0.9))


# In[20]:


#%%
# Combine summary stats, add header and print to markdown
frames = [nswre74_treat_stats,
          nswre74_control_stats,
          cps1re74_control_stats,
          cps3re74_control_stats,
          cps1re74_ptrim_stats,
          cps3re74_ptrim_stats]
#%%
summary_stats = pd.concat(frames, axis = 1)
header        = ["NSW Treat", "NSW Control", \
                 "Full CPS-1", "Full CPS-3", \
                 "P-score CPS-1", "P-score CPS-3"]

print(tabulate(summary_stats, header, tablefmt = "pipe"))


# # Instrumental variables

# In[21]:


from matplotlib.ticker import FormatStrFormatter


# ## Angrist and Krueger

# In[22]:


# Calculate means by educ and lwklywge
groupbybirth = pums.groupby(['yob', 'qob'])
birth_means  = groupbybirth['lwklywge', 'educ'].mean()


# In[39]:


# Create function to plot figures
def plot_qob(yvar, ax, title, ylabel):
    values = yvar.values
    ax.plot(values, color = 'k')

    for i, y in enumerate(yvar):
        qob = yvar.index.get_level_values('qob')[i]
        ax.annotate(qob,
                    (i, y),
                    xytext = (-5, 5),
                    textcoords = 'offset points')
        if qob == 1:
            ax.scatter(i, y, marker = 's', facecolors = 'none', edgecolors = 'y')
        else:
            ax.scatter(i, y, marker = 's', color = 'r')

    ax.set_xticks(range(0, len(yvar), 4))
    ax.set_xticklabels(yvar.index.get_level_values('yob')[1::4])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    ax.set_xlabel("Year of birth")
    ax.margins(0.1)


# In[40]:


fig, (ax1, ax2) = plt.subplots(2, figsize=(12, 9), sharex = True)

plot_qob(yvar   = birth_means['educ'],
         ax     = ax1,
         title  = 'A. Average education by quarter of birth (first stage)',
         ylabel = 'Years of education')

plot_qob(yvar   = birth_means['lwklywge'],
         ax     = ax2,
         title  = 'B. Average weekly wage by quarter of birth (reduced form)',
         ylabel = 'Log weekly earnings')

fig.tight_layout()


# ### IV estimation

# In[26]:


from linearmodels.iv import IV2SLS


# In[31]:


qobs = pd.get_dummies(pums.qob, prefix='qob')
pums = pd.concat([pums, qobs], axis=1)


# In[32]:


pums.head()


# In[37]:


formula = 'lwklywge ~ 1 [educ ~ qob_2 + qob_3 + qob_4]'
mod = IV2SLS.from_formula(formula, pums).fit(cov_type='robust')


# In[38]:


print(mod.summary)


# # Clustering / Fixed Effects 

# In[7]:


import statsmodels.stats.sandwich_covariance as sw


# In[8]:


base_mincer = smf.ols(formula='lwklywge ~ educ', data=pums).fit(cov_type='HC1')
print(base_mincer.summary())


# In[9]:


cluster_mincer = smf.ols(formula='lwklywge ~ educ', data=pums).fit(cov_type='cluster',
                                                        cov_kwds={'groups': pums['pob']},
                                                        use_t=True)
print(cluster_mincer.summary())


# In[10]:


fe_mincer = smf.ols(formula='lwklywge ~ educ + C(pob)', data=pums).fit(cov_type='HC1',
                                                        use_t=True)
print(fe_mincer.summary())