#!/usr/bin/env python # coding: utf-8 # In[1]: #%% # pyscience imports import os import sys import glob import numpy as np import pandas as pd import statsmodels.api as sm import statsmodels.formula.api as smf # graphics from plotnine import * import matplotlib.pyplot as plt import seaborn as sns # plt.style.use("seaborn-darkgrid") sns.set(style="ticks", context="talk") plt.style.use("dark_background") # dark bg plots # %matplotlib inline # run for jupyter notebook from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = 'all' # In[2]: get_ipython().run_line_magic('pwd', '') # # Regressions # In[3]: import urllib import zipfile import urllib.request # In[4]: dldat = False if dldat: # Download data and unzip urllib.request.urlretrieve('http://economics.mit.edu/files/397', 'asciiqob.zip') with zipfile.ZipFile('asciiqob.zip', "r") as z: z.extractall() # In[3]: # Read the data into a pandas dataframe pums = pd.read_csv('asciiqob.txt', header = None, delim_whitespace = True) pums.columns = ['lwklywge', 'educ', 'yob', 'qob', 'pob'] pums.head() # In[6]: model = smf.ols('lwklywge ~ educ', data=pums) print(model.fit().summary()) #%% results = model.fit() educ_coef = results.params[1] intercept = results.params[0] #%% # In[7]: # Calculate means by educ attainment and predicted values groupbyeduc = pums.groupby('educ') educ_means = groupbyeduc['lwklywge'].mean() yhat = pd.Series(intercept + educ_coef * educ_means.index.values, index = educ_means.index.values) #%% # In[8]: sumstats = educ_means.reset_index() sumstats['intercept'] = 1 sumstats['yhat'] = results.predict(sumstats) sumstats # In[9]: # Create plot plt.figure() educ_means.plot(style='-o') yhat.plot() plt.xlabel("Years of completed education") plt.ylabel("Log weekly earnings, \$2003") plt.suptitle("Education ~ Wage CEF and linear fit") plt.legend().set_visible(False) # In[10]: ggplot(sumstats, aes(x = 'educ')) + geom_point(aes(y = 'lwklywge')) + geom_line(aes(y='yhat'), color = 'red') +\ labs(x = 'Years of Completed Education', y = 'Log Weekly earnings, $2003', title = 'Education ~ Wage CEF and linear fit') # # Propensity Score # In[11]: import patsy from tabulate import tabulate #%% # In[12]: #%% # Download data urllib.request.urlretrieve('http://economics.mit.edu/files/3828', 'nswre74.dta') urllib.request.urlretrieve('http://economics.mit.edu/files/3824', 'cps1re74.dta') urllib.request.urlretrieve('http://economics.mit.edu/files/3825', 'cps3re74.dta') #%% # Read the Stata files into Python nswre74 = pd.read_stata("nswre74.dta") cps1re74 = pd.read_stata("cps1re74.dta") cps3re74 = pd.read_stata("cps3re74.dta") # In[13]: # nswre74.head() ggplot(nswre74, aes(x = 're78')) + geom_histogram(bins=30) # In[14]: #%% # Store list of variables for summary summary_vars = ['age', 'ed', 'black', 'hisp', 'nodeg', 'married', 're74', 're75'] # Calculate propensity scores # Create formula for probit f = 'treat ~ ' + ' + '.join(['age', 'age2', 'ed', 'black', 'hisp', \ 'nodeg', 'married', 're74', 're75']) #%% # In[15]: # Run probit with CPS-1 y, X = patsy.dmatrices(f, cps1re74, return_type = 'dataframe') model = sm.Probit(y, X).fit() cps1re74['pscore'] = model.predict(X) # In[16]: # nswre74.head() ggplot(cps1re74, aes(x = 'pscore')) + geom_histogram(bins=30) + geom_density() # In[17]: #%% # Run probit with CPS-3 y, X = patsy.dmatrices(f, cps3re74, return_type = 'dataframe') model = sm.Probit(y, X).fit() cps3re74['pscore'] = model.predict(X) # In[18]: # nswre74.head() ggplot(cps3re74, aes(x = 'pscore')) + geom_histogram(bins=30) + geom_density() # In[19]: #%% # Create function to summarize data def summarize(dataset, conditions): stats = dataset[summary_vars][conditions].mean() stats['count'] = sum(conditions) return stats #%% # Summarize data nswre74_treat_stats = summarize(nswre74, nswre74.treat == 1) nswre74_control_stats = summarize(nswre74, nswre74.treat == 0) cps1re74_control_stats = summarize(cps1re74, cps1re74.treat == 0) cps3re74_control_stats = summarize(cps3re74, cps3re74.treat == 0) cps1re74_ptrim_stats = summarize(cps1re74, (cps1re74.treat == 0) & \ (cps1re74.pscore > 0.1) & \ (cps1re74.pscore < 0.9)) cps3re74_ptrim_stats = summarize(cps3re74, (cps3re74.treat == 0) & \ (cps3re74.pscore > 0.1) & \ (cps3re74.pscore < 0.9)) # In[20]: #%% # Combine summary stats, add header and print to markdown frames = [nswre74_treat_stats, nswre74_control_stats, cps1re74_control_stats, cps3re74_control_stats, cps1re74_ptrim_stats, cps3re74_ptrim_stats] #%% summary_stats = pd.concat(frames, axis = 1) header = ["NSW Treat", "NSW Control", \ "Full CPS-1", "Full CPS-3", \ "P-score CPS-1", "P-score CPS-3"] print(tabulate(summary_stats, header, tablefmt = "pipe")) # # Instrumental variables # In[21]: from matplotlib.ticker import FormatStrFormatter # ## Angrist and Krueger # In[22]: # Calculate means by educ and lwklywge groupbybirth = pums.groupby(['yob', 'qob']) birth_means = groupbybirth['lwklywge', 'educ'].mean() # In[39]: # Create function to plot figures def plot_qob(yvar, ax, title, ylabel): values = yvar.values ax.plot(values, color = 'k') for i, y in enumerate(yvar): qob = yvar.index.get_level_values('qob')[i] ax.annotate(qob, (i, y), xytext = (-5, 5), textcoords = 'offset points') if qob == 1: ax.scatter(i, y, marker = 's', facecolors = 'none', edgecolors = 'y') else: ax.scatter(i, y, marker = 's', color = 'r') ax.set_xticks(range(0, len(yvar), 4)) ax.set_xticklabels(yvar.index.get_level_values('yob')[1::4]) ax.set_title(title) ax.set_ylabel(ylabel) ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f')) ax.set_xlabel("Year of birth") ax.margins(0.1) # In[40]: fig, (ax1, ax2) = plt.subplots(2, figsize=(12, 9), sharex = True) plot_qob(yvar = birth_means['educ'], ax = ax1, title = 'A. Average education by quarter of birth (first stage)', ylabel = 'Years of education') plot_qob(yvar = birth_means['lwklywge'], ax = ax2, title = 'B. Average weekly wage by quarter of birth (reduced form)', ylabel = 'Log weekly earnings') fig.tight_layout() # ### IV estimation # In[26]: from linearmodels.iv import IV2SLS # In[31]: qobs = pd.get_dummies(pums.qob, prefix='qob') pums = pd.concat([pums, qobs], axis=1) # In[32]: pums.head() # In[37]: formula = 'lwklywge ~ 1 [educ ~ qob_2 + qob_3 + qob_4]' mod = IV2SLS.from_formula(formula, pums).fit(cov_type='robust') # In[38]: print(mod.summary) # # Clustering / Fixed Effects # In[7]: import statsmodels.stats.sandwich_covariance as sw # In[8]: base_mincer = smf.ols(formula='lwklywge ~ educ', data=pums).fit(cov_type='HC1') print(base_mincer.summary()) # In[9]: cluster_mincer = smf.ols(formula='lwklywge ~ educ', data=pums).fit(cov_type='cluster', cov_kwds={'groups': pums['pob']}, use_t=True) print(cluster_mincer.summary()) # In[10]: fe_mincer = smf.ols(formula='lwklywge ~ educ + C(pob)', data=pums).fit(cov_type='HC1', use_t=True) print(fe_mincer.summary())