#!/usr/bin/env python # coding: utf-8 # #
# # Website # #
# # Ghani, Rayid, Frauke Kreuter, Julia Lane, Adrianne Bradford, Alex Engler, Nicolas Guetta Jeanrenaud, Graham Henke, Daniela Hochfellner, Clayton Hunter, Brian Kim, Avishek Kumar, and Jonathan Morgan. # # _to be updated on export_ # # Imputing Missing Values (Employers and Employees Data) # ---- # ## Python Setup # - Back to [Table of Contents](#Table-of-Contents) # In[ ]: get_ipython().run_line_magic('pylab', 'inline') import pandas as pd import psycopg2 from sqlalchemy import create_engine import matplotlib.pyplot as plt import statsmodels.api as sm # In[ ]: db_name = "appliedda" hostname = "10.10.2.10" conn = psycopg2.connect(database=db_name, host = hostname) #database connection # ## Employer Data # Let's use wage and employee statistics from the MO wage records. # In[ ]: conn = psycopg2.connect(database = db_name, host = hostname) # In[ ]: query = ''' SELECT ein, seinunit, empr_no , empl_month1::int + empl_month2::int + empl_month3::int AS total_empl , total_wages FROM il_des_kcmo.il_qcew_employers WHERE year = 2013 AND quarter = 1 ''' # In[ ]: df_wages = pd.read_sql(query, conn) df_wages = df_wages[df_wages['ein'].notnull()] # ### Observing Missing Values # In[ ]: df_wages.head() # Unfortunately, some observations present some irregularities or data inputs that do not make sense: # > Some employers have 0 total employees and/or 0 total wages. # In[ ]: df_wages.query('total_empl'==0 or 'total_wages'==0).head() # In[ ]: df_wages[(df_wages['total_empl']==0) | (df_wages['total_wages']==0)].head() # How extensive is this issue? # In[ ]: missing = df_wages[(df_wages['total_empl']==0) | (df_wages['total_wages']==0)].shape[0] # In[ ]: print("{:,.0f} observations have either missing employees or wages.".format(missing)) # **Can we impute these missing values in any reasonable way?** # ### Imputing Method 1: Imputing to the median (or mean) # One first method for imputing missing values is simply to presume that missing values are equal to the sample's median (or mean). In this case, we simply calculate the samples median, and replace missing values by that value. # In[ ]: df_wages1 = df_wages.copy() # In[ ]: # Let's subset df_wages to observations where the 2 variables are not missing: df_wages1_s = df_wages[(df_wages1['total_empl']>0) & (df_wages1['total_wages']>0)] # In[ ]: med_wage = df_wages1_s['total_wages'].median() med_wage # In[ ]: med_empl = df_wages1_s['total_empl'].median() med_empl # In[ ]: print('The median company has {:.0f} employees and pays out {:,.0f} in wages.'.format( med_empl, med_wage)) # Let's use these values to impute the missing ones. # In[ ]: df_wages1['total_empl'] = df_wages1['total_empl'].replace(0, med_empl) df_wages1['total_wages'] = df_wages1['total_wages'].replace(0, med_wage) # Did the operation work? # In[ ]: missing = df_wages1[(df_wages1['total_empl']==0) | (df_wages1['total_wages']==0)].shape[0] # In[ ]: print('operation {} work'.format('did' if missing==0 else 'did not')) # ### Imputing Method 2: Regression Imputation # In[ ]: df_wages2 = df_wages.copy() # The missing values can be split into three categories: # In[ ]: # Missing wages and employees df_wages2[(df_wages2['total_empl']==0) & (df_wages2['total_wages']==0)].shape # In[ ]: # Missing employees but non-missing wages df_wages2[(df_wages2['total_empl']==0) & (df_wages2['total_wages']!=0)].shape # In[ ]: # Missing wages but non-missing employees df_wages2[(df_wages2['total_empl']!=0) & (df_wages2['total_wages']==0)].shape # > In some observations, we have information on the employees but not the wages. In other cases, we have information on the wages but not the employees. # # Intuitively, these 2 variables are probably related. Is this the case? # In[ ]: plt.scatter(df_wages2['total_empl'], df_wages2['total_wages']) # In[ ]: # Subset to lower wages and employees: df_wages2_s = df_wages2[(df_wages2['total_empl']<6000)&(df_wages2['total_wages']<20000000)] plt.scatter(df_wages2_s['total_empl'], df_wages2_s['total_wages']) # > **Let's start by imputing Wage from Number of Employees:** # In[ ]: model = sm.OLS(df_wages2['total_wages'], df_wages2['total_empl']).fit() model.summary() # In[ ]: df_wages2['total_wage_p'] = model.predict(df_wages2['total_empl']) # > **Now let's impute number of employees from wages:** # In[ ]: model = sm.OLS(df_wages2['total_empl'], df_wages2['total_wages']).fit() model.summary() # In[ ]: df_wages2['total_empl_p'] = model.predict(df_wages2['total_wages']) # **How do results look?** # In[ ]: df_wages2.head() # **Let's replace employees and wages when relevant:** # In[ ]: df_wages2['total_empl'] = np.where(df_wages2['total_empl']==0, df_wages2['total_empl_p'], df_wages2['total_empl']) # In[ ]: df_wages2['total_wages'] = np.where(df_wages2['total_wages']==0, df_wages2['total_wage_p'], df_wages2['total_wages']) # **Did the imputation work?** # In[ ]: # Missing employees but non-missing wages df_wages2[(df_wages2['total_empl']==0) & (df_wages2['total_wages']!=0)].shape # In[ ]: # Missing wages but non-missing employees missing2 = df_wages2[(df_wages2['total_empl']!=0) & (df_wages2['total_wages']==0)].shape[0] # In[ ]: print('operation {} work'.format('did' if missing2==0 else 'did not')) # ## Employee Data # Let's query the Wage data, and add the NAICS industry code from the Employers data. # In[ ]: conn = psycopg2.connect(database = db_name, host = hostname) # In[ ]: query = ''' SELECT a.ssn, a.ein, b.naics, a.wage FROM il_des_kcmo.il_wage as a LEFT JOIN il_des_kcmo.il_qcew_employers as b ON a.ein = b.ein and a.seinunit = b.seinunit and a.empr_no = b.empr_no WHERE a.year = 2011 AND a.quarter = 1 and b.year = 2011 and b.quarter = 1 ''' # In[ ]: df_wages = pd.read_sql(query, conn) # In[ ]: df_wages.shape # ### Observing Missing Values # In[ ]: df_wages.shape # In[ ]: df_wages.tail() # Unfortunately, some observations present some irregularities or data inputs that do not make sense: # > Some employees have 0 total wages. # In[ ]: df_wages[df_wages['wage'] == 0].head() # How extensive is this issue? # In[ ]: print('{:,.0f} observations have missing wages.'.format( df_wages[df_wages['wage'] == 0].shape[0])) # **Can we impute these missing values in any reasonable way?** # ### Imputing Method 1: Imputing to the median (or mean) # One first method for imputing missing values is simply to presume that missing values are equal to the sample's median (or mean). In this case, we simply calculate the samples median, and replace missing values by that value. # In[ ]: df_wages1 = df_wages.copy() # In[ ]: # Let's subset df_wages to observations where the wages are not missing: df_wages1_s = df_wages[df_wages1['wage']>0] # In[ ]: med_wage = df_wages1_s['wage'].median() med_wage # Let's replace missing wages with the median # In[ ]: df_wages1['wage'] = df_wages1['wage'].replace(0, med_wage) # Did the operation work? # In[ ]: missing3 = df_wages1[df_wages1['wage']==0].shape[0] print('operation {} work'.format('did' if missing3==0 else 'did not')) # ### Imputing Method 2: Imputing based on Industry # In[ ]: df_wages2 = df_wages.copy() # Median wages are probably very different based on Industry. Let's take a look: # In[ ]: # Create 2-Digit Industry Code df_wages2['naics2'] = df_wages['naics'].str[:2] # In[ ]: df_wages2[df_wages2['wage']>0].shape # In[ ]: df_wages2[df_wages2['wage']>0]['naics2'].value_counts() # In[ ]: # Group data bu 2-Digit NAICS code and calculate median wage. df_wages2_gb = df_wages2[df_wages2['wage']>0] df_wages2_gb = df_wages2_gb.groupby('naics2')['wage'].median().reset_index() df_wages2_gb.head() # In[ ]: # df_wages2_gb = df_wages2_gb['wage'].median().reset_index() df_wages2_gb = df_wages2_gb.rename(index = str, columns = {'wage': 'median_wage'}) # In[ ]: # Visualize median wage by industry code plt.bar(df_wages2_gb['naics2'], df_wages2_gb['median_wage']) # In[ ]: # Merge median wage back onto the data df_wages2 = pd.merge(df_wages2, df_wages2_gb, how = 'left', on = 'naics2') # **Let's look at the data:** # In[ ]: df_wages2.head() # **Let's replace employees and wages when relevant:** # In[ ]: df_wages2['wage'] = np.where(df_wages2['wage']==0, df_wages2['median_wage'], df_wages2['wage']) # **Did the imputation work?** # In[ ]: # Missing employees but non-missing wages missing4 = df_wages2[df_wages2['wage']==0].shape[0] print('operation {} work'.format('did' if missing4==0 else 'did not'))