#!/usr/bin/env python
# coding: utf-8

# In[3]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import scipy.stats
import statsmodels.api as sm


# # Model Specification
# So far we have assumed correct **model specification**; if the model is not correctly specified, we encounter **model specification errors**. Here are the ideal criteria for specifying a model:
# 1. Be able to make logical predictions.
# 2. Be consistent with theory.
# 3. Independent variables are uncorrelated with the disturbance term.
# 4. Parameter constancy.
# 5. Residuals should be white noise.
#
# However, in practice, model specification errors are almost unavoidable. Here is a list of ways we might encounter them:
#
# 1. Omission of a relevant variable(s).
# 2. Inclusion of an unnecessary variable(s).
# 3. Adoption of the wrong functional form.
# 4. Errors of measurement in the data collection process.
# 5. Incorrect specification of the stochastic error term.
# 6. Assumption that the error term is normally distributed.

# This chapter is exceedingly difficult to treat in the multiple-independent-variable case without linear algebra, so we will only scratch the surface of the topic.

# # Omission of A Relevant Variable
# Suppose the true relationship is
# $$
# Y_i = 3 + 4X_2 + 5X_3 + u_i
# $$
# However, we estimate a model with only $X_3$, i.e. with $X_2$ omitted
# $$
# \hat{Y} = b_1+b_3X_3
# $$
# We will run $100000$ rounds of Monte Carlo simulation (this might take some time; dial it down if necessary).

# In[155]:

n = 100
beta1, beta2, beta3 = 3, 4, 5
X2 = np.random.rand(n)
X3 = np.random.rand(n)

beta3_hat_list = []
for i in range(100000):
    u = np.random.randn(n)
    Y = beta1 + beta2 * X2 + beta3 * X3 + u
    df = pd.DataFrame(
        np.concatenate(
            (Y[:, np.newaxis], X2[:, np.newaxis], X3[:, np.newaxis]), axis=1
        ),
        columns=["Y", "X2", "X3"],
    )
    X = df["X3"]
    Y = df["Y"]
    X = sm.add_constant(X)  # adding a constant
    model = sm.OLS(Y, X).fit()
    beta3_hat_list.append(model.params[1])


# Print the full report of the last round of simulation.

# In[156]:

model = sm.OLS(Y, X).fit()
print_model = model.summary()
print(print_model)


# In[157]:

fig, ax = plt.subplots(figsize=(12, 7))
ax.hist(beta3_hat_list, bins=150)
ax.axvline(x=np.mean(beta3_hat_list), color="tomato", label="mean of $b_3$")
ax.set_xlabel("$b_3$")
ax.set_title("Sampling Distribution of $b_3$")
ax.legend()
plt.show()


# In[97]:

print("The mean of b3 is {}.".format(np.mean(beta3_hat_list)))


# With $100000$ rounds of simulation, we can easily see that the sampling distribution is biased: the mean of $b_3$ is far from the true value $5$. We will demonstrate why this is the case without linear algebra.
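# Before the algebra, a quick numerical sanity check (a sketch reusing `X2`, `X3`, `beta2`, `beta3` and `beta3_hat_list` from the simulation above): the derivation below shows that the bias of $b_3$ equals $\beta_2 \operatorname{Cov}(X_2, X_3)/\operatorname{Var}(X_3)$, and the simulated gap should be close to that number.

# In[ ]:

# Theoretical omitted-variable bias term vs. the bias observed in the simulation.
theoretical_bias = beta2 * np.cov(X2, X3)[0, 1] / np.var(X3, ddof=1)
simulated_bias = np.mean(beta3_hat_list) - beta3
print("Theoretical bias term: {:.4f}".format(theoretical_bias))
print("Simulated bias of b3:  {:.4f}".format(simulated_bias))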
# We have shown in the second chapter that the estimator $b_3$ in the two-independent-variable case is
# $$
# b_{3}=\frac{\operatorname{Cov}\left(X_{3}, Y\right) \operatorname{Var}\left(X_{2}\right)-\operatorname{Cov}\left(X_{2}, Y\right) \operatorname{Cov}\left(X_{3}, X_{2}\right)}{\operatorname{Var}\left(X_{3}\right) \operatorname{Var}\left(X_{2}\right)-\left[\operatorname{Cov}\left(X_{3}, X_{2}\right)\right]^{2}}
# $$
# However, unaware of the omission of $X_2$, we end up using the estimator
# $$
# b_{3}=\frac{\operatorname{Cov}\left(X_{3}, Y\right)}{\operatorname{Var}\left(X_{3}\right)}
# $$
# Substituting the true relationship for $Y$ offers some insight
# $$
# \begin{aligned}
# b_{3} &=\frac{\operatorname{Cov}\left(X_{3}, Y\right)}{\operatorname{Var}\left(X_{3}\right)}=\frac{\operatorname{Cov}\left(X_{3},\left[\beta_{1}+\beta_{2} X_{2}+\beta_{3} X_{3}+u\right]\right)}{\operatorname{Var}\left(X_{3}\right)} \\
# &=\frac{1}{\operatorname{Var}\left(X_{3}\right)}\left[\operatorname{Cov}\left(X_{3}, \beta_{1}\right)+\operatorname{Cov}\left(X_{3}, \beta_{2} X_{2}\right)+\operatorname{Cov}\left(X_{3}, \beta_{3} X_{3}\right)+\operatorname{Cov}\left(X_{3}, u\right)\right] \\
# &=\frac{1}{\operatorname{Var}\left(X_{3}\right)}\left[0+\beta_{2} \operatorname{Cov}\left(X_{2}, X_3\right)+\beta_{3} \operatorname{Var}\left(X_{3} \right)+\operatorname{Cov}\left(X_{3}, u\right)\right] \\
# &=\beta_{3}+\beta_{2} \frac{\operatorname{Cov}\left(X_{2}, X_{3}\right)}{\operatorname{Var}\left(X_{3}\right)}+\frac{\operatorname{Cov}\left(X_{3}, u\right)}{\operatorname{Var}\left(X_{3}\right)}
# \end{aligned}
# $$
# This is the relationship between $b_3$ and $\beta_3$ when $X_2$ is omitted from the model. Whether the bias is upward or downward depends on the sign of $\beta_2$ and on the covariance of $X_2$ and $X_3$; it is safe to assume $\operatorname{Cov}(X_3, u)=0$.
# As you can see, the bias term is
# $$
# \beta_{2} \frac{\operatorname{Cov}\left(X_{2}, X_{3}\right)}{\operatorname{Var}\left(X_{3}\right)}
# $$
# Therefore $b_3$ can still be unbiased, on the condition that $\operatorname{Cov}(X_2, X_3)=0$; however, this is extremely unlikely in practice.
#
# In general, omission of relevant variables also invalidates hypothesis tests, but we will skip that topic entirely.

# # Inclusion Of An Unnecessary Variable
# Now suppose instead that the true relationship is
# $$
# Y_i = 3 + 4X_2 + u_i
# $$
# However, we include an unnecessary variable $X_3$, i.e.
# $$
# \hat{Y} = b_1+b_2X_2 + b_3X_3
# $$

# In[149]:

n = 100
beta1, beta2 = 3, 4
X2 = np.random.rand(n)
X3 = np.random.rand(n)

beta2_hat_list = []
for i in range(100000):
    u = np.random.randn(n)
    Y = beta1 + beta2 * X2 + u
    df = pd.DataFrame(
        np.concatenate(
            (Y[:, np.newaxis], X2[:, np.newaxis], X3[:, np.newaxis]), axis=1
        ),
        columns=["Y", "X2", "X3"],
    )
    X = df[["X2", "X3"]]
    Y = df["Y"]
    X = sm.add_constant(X)  # adding a constant
    model = sm.OLS(Y, X).fit()
    beta2_hat_list.append(model.params[1])


# Print the full report of the last round of simulation.

# In[150]:

print_model = model.summary()
print(print_model)


# In[151]:

fig, ax = plt.subplots(figsize=(12, 7))
ax.hist(beta2_hat_list, bins=150)
ax.axvline(x=np.mean(beta2_hat_list), color="tomato", label="mean of $b_2$")
ax.set_xlabel("$b_2$")
ax.set_title("Sampling Distribution of $b_2$")
ax.legend()
plt.show()


# It turns out that $b_2$ is not biased after all! However, let's compare it with the sampling distribution of $b_2$ from the correctly specified model.
# In[133]:

beta2_hat_list_correct = []
for i in range(100000):
    u = np.random.randn(n)
    Y = beta1 + beta2 * X2 + u
    df = pd.DataFrame(
        np.concatenate((Y[:, np.newaxis], X2[:, np.newaxis]), axis=1),
        columns=["Y", "X2"],
    )
    X = df["X2"]
    Y = df["Y"]
    X = sm.add_constant(X)  # adding a constant
    model = sm.OLS(Y, X).fit()
    beta2_hat_list_correct.append(model.params[1])


# In[146]:

fig, ax = plt.subplots(figsize=(12, 7))
ax.hist(beta2_hat_list, bins=150, alpha=0.5, label="$b_2$ With An Unnecessary Variable")
ax.hist(
    beta2_hat_list_correct,
    bins=150,
    alpha=0.3,
    color="tomato",
    label="$b_2$ With Correct Model",
)
ax.set_xlabel("$b_2$")
ax.set_title("Sampling Distribution of $b_2$")
ax.legend()
plt.show()


# If you take a closer look at the distributions, they are actually different: the $b_2$ estimated by the correct model has a more concentrated distribution, i.e. the middle area is higher, whereas the $b_2$ estimated by the incorrectly specified model has fatter tails. We can check their standard deviations.

# In[135]:

np.std(beta2_hat_list)


# In[136]:

np.std(beta2_hat_list_correct)


# To remind you why this is the case, here we reproduce the formula for the variance of $b_2$ in the two-independent-variable regression
# $$
# \sigma_{b_{2}}^{2}=\frac{\sigma_{u}^{2}}{n \operatorname{Var}\left(X_{2}\right)} \frac{1}{1-r_{X_{2} X_{3}}^{2}}
# $$
# $\sigma_{b_{2}}^{2}$ is enlarged by the factor $1/(1-r_{X_{2} X_{3}}^{2})$, which is why the blue distribution has fatter tails.
#
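# As a quick check (a sketch reusing `X2`, `X3` and the two simulated lists from the cells above), the ratio of the two simulated variances of $b_2$ should be roughly the inflation factor $1/(1-r_{X_{2} X_{3}}^{2})$.

# In[ ]:

# Compare the variance inflation factor with the ratio of simulated variances.
r = np.corrcoef(X2, X3)[0, 1]
inflation_factor = 1 / (1 - r ** 2)
variance_ratio = np.var(beta2_hat_list) / np.var(beta2_hat_list_correct)
print("Inflation factor 1/(1 - r^2):       {:.4f}".format(inflation_factor))
print("Ratio of simulated variances of b2: {:.4f}".format(variance_ratio))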
# # Asymmetry of Specification Errors
# There is an asymmetry between the two types of specification errors: including irrelevant variables still grants us unbiased and consistent estimates, though standard errors are inflated, whereas excluding a relevant variable causes bias and inconsistency.

#
# Does this suggest that we should err on the side of adding irrelevant variables? In the field of econometrics, not really. The _best practice_ is to include only explanatory variables that, on theoretical grounds, directly influence the dependent variable and that are not accounted for by other included variables.
#
# # Tests of Specification Errors
# Truth be told, we will never be sure how the observed data were generated, but we can make educated guesses about specification errors with the help of statistical tests.

# ## Tests of Overfitted Models
# The most convenient tools for detecting unnecessary variables are the $F$- and $t$-tests. Take a look at the regression results with the unnecessary variable: $X_3$ has a p-value of $0.757$, so we can safely deem $X_3$ unnecessary.
#
# Purist econometricians object to the method of adding independent variables iteratively based on $t$-tests; data scientists, however, take the more practical view that the model should be driven by the data, i.e. the model should learn from and express the data.

# ## Tests of Underfitted Models
# Detecting an underfitted model takes more than looking at $t$'s or $F$'s; we also need to investigate the broad features of the results, such as $R^2$, the signs of the estimates, the residuals and other relevant tests.

# ### Investigating Residuals
# Here we reproduce the underfitted model example. The underfitted model obviously has a larger dispersion of residuals.

# In[185]:

n = 200
beta1, beta2, beta3 = 3, 4, 5
X2 = np.random.rand(n)
X3 = np.random.rand(n)
u = np.random.randn(n)

Y = beta1 + beta2 * X2 + beta3 * X3 + u
df = pd.DataFrame(
    np.concatenate((Y[:, np.newaxis], X2[:, np.newaxis], X3[:, np.newaxis]), axis=1),
    columns=["Y", "X2", "X3"],
)

X_underfit = df["X2"]
Y = df["Y"]
X_underfit = sm.add_constant(X_underfit)
model_underfit = sm.OLS(Y, X_underfit).fit()

X_wellfit = df[["X2", "X3"]]
X_wellfit = sm.add_constant(X_wellfit)
model_wellfit = sm.OLS(Y, X_wellfit).fit()

fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(14, 12))
ax[0].scatter(np.arange(len(model_underfit.resid)), model_underfit.resid)
ax[0].set_xlim(0, n)
ax[0].set_ylim(-10, 10)
ax[0].grid()
ax[0].set_title("Residuals Plot of Underfitted Model")

ax[1].scatter(np.arange(len(model_wellfit.resid)), model_wellfit.resid)
ax[1].set_xlim(0, n)
ax[1].set_ylim(-10, 10)
ax[1].grid()
ax[1].set_title("Residuals Plot of Wellfitted Model")
plt.show()


# ### Ramsey’s RESET Test
# Ramsey's **Regression Equation Specification Error Test** (RESET) is a general test for specification error.

# In[216]:

reset_results = sm.stats.diagnostic.linear_reset(model_underfit)


# In[217]:

print(reset_results.summary())


# ### Lagrange Multiplier (LM) Test for Adding Variables

# # $AIC$ and $BIC$
# Besides $R^2$ and $\bar{R}^2$, which were discussed in the first two chapters, here are two more statistics for model selection: **Akaike’s Information Criterion** (AIC) and the **Bayesian Information Criterion** (BIC, also known as Schwarz's SIC).
#
# Both statistics are standard output in the estimation report; you can check the reports above.

# ## **Akaike’s Information Criterion**
# $AIC$ imposes a stronger penalty than $\bar{R}^2$. The formula is
# $$
# \mathrm{AIC}=e^{2 k / n} \frac{\sum e_{i}^{2}}{n}=e^{2 k / n} \frac{\mathrm{RSS}}{n}
# $$
# where $n$ is the number of observations and $k$ is the number of independent variables. $AIC$ is commonly used in time series models to determine the lag length.

# ## **Bayesian Information Criterion**
# $BIC$ imposes an even harsher penalty than $AIC$
# $$
# \mathrm{BIC}=n^{k / n} \frac{\sum e^{2}_i}{n}=n^{k / n} \frac{\mathrm{RSS}}{n}
# $$
# As you can see from the formulas, smaller values of both criteria are preferred, since they correspond to a smaller $RSS$.
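# Both criteria can also be read directly from the models fitted earlier: statsmodels exposes them as the `aic` and `bic` attributes of a fitted results object (computed from the log-likelihood rather than the RSS-based formulas above, but used the same way: smaller is better). A minimal check on the underfitted and well-fitted models is sketched below; the correctly specified model should score lower on both criteria.

# In[ ]:

# Compare information criteria of the underfitted and well-fitted models above.
print("Underfitted model: AIC = {:.2f}, BIC = {:.2f}".format(model_underfit.aic, model_underfit.bic))
print("Well-fitted model: AIC = {:.2f}, BIC = {:.2f}".format(model_wellfit.aic, model_wellfit.bic))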
#
# We can plot $AIC$ and $BIC$ as functions of the number of variables. It is easy to see that $BIC$ imposes the higher penalty; however, that does not mean $BIC$ is superior to $AIC$.

# In[232]:

n = 100
k = np.arange(1, 21)
RSS = 1000


def aic(RSS, k, n):
    return np.exp(2 * k / n) * RSS / n


def bic(RSS, k, n):
    return n ** (k / n) * RSS / n


aic_values = aic(RSS, k, n)
bic_values = bic(RSS, k, n)

fig, ax = plt.subplots(figsize=(12, 7))
ax.plot(k, aic_values, label="AIC", lw=3)
ax.plot(k, bic_values, label="BIC", lw=3)
ax.legend()
plt.show()


# Both criteria are also commonly used in data science to compare **in-sample** and **out-of-sample** performance.

# # Measurement Error
# Keep in mind that any dataset may contain some measurement error, whether due to mis-recording or mis-communication. Most of the time we assume the data are correctly measured; here we discuss what consequences measurement error can cause, using simple linear regression examples.

# ## Measurement Error in Independent Variables
# Assume the true relationship is
# $$
# Y_{i}=\beta_{1}+\beta_{2} Z_{i}+v_{i}
# $$
# However, due to some technical reason we are unable to measure $Z_i$ precisely; what we can observe is $X_i$, which is related to $Z_i$ by
# $$
# X_{i}=Z_{i}+w_{i}
# $$
# Combining them,
# $$
# Y_{i}=\beta_{1}+\beta_{2}\left(X_{i}-w_{i}\right)+v_{i}=\beta_{1}+\beta_{2} X_{i}+v_{i}-\beta_{2} w_{i}
# $$
# The disturbance term $v_i-\beta_2w_i$ is a composite term, and $X_i$ is not independent of it because both share the common component $w_i$.
#
# Recall that the estimator $b_2$ can be decomposed as
# $$
# b_{2}=\frac{\operatorname{Cov}(X, Y)}{\operatorname{Var}(X)}=\beta_{2}+\frac{\operatorname{Cov}(X, u)}{\operatorname{Var}(X)}
# $$
# However, in this case $X_i$ and $u_i$ are not independent, as we can see by expanding the covariance
# $$
# \operatorname{Cov}(X, u)= \operatorname{Cov}((Z+w),(v-\beta_2w)) = \operatorname{Cov}(Z,v)+\operatorname{Cov}(w,v)+\operatorname{Cov}(Z,-\beta_2w)+\operatorname{Cov}(w,-\beta_2w) = -\beta_2\sigma_w^2
# $$
# Expanding the variance in the denominator as well,
# $$
# \operatorname{Var}(X)=\operatorname{Var}(Z+w)=\operatorname{Var}(Z)+\operatorname{Var}(w)+2\operatorname{Cov}(Z,w)=\sigma_Z^2+\sigma_w^2
# $$
# Therefore
# $$
# b_{2}=\frac{\operatorname{Cov}(X, Y)}{\operatorname{Var}(X)}=\beta_{2}-\frac{\beta_2\sigma_w^2}{\sigma_Z^2+\sigma_w^2}
# $$
# This is how measurement error affects the estimate theoretically: $b_2$ is biased toward zero, i.e. downward here since $\beta_2>0$.
# We can show this with a Monte Carlo simulation.

# In[261]:

Z = np.arange(1, 21)
b2_array = []
for i in range(10000):
    w = np.random.randn(20)
    v = np.random.randn(20)
    X = Z + w
    beta1, beta2 = 2, 3
    Y = beta1 + beta2 * Z + v
    b2_array.append(sp.stats.linregress(X, Y)[0])

fig, ax = plt.subplots(figsize=(10, 7))
ax.hist(b2_array, bins=100)
ax.axvline(x=np.mean(b2_array), lw=3, color="tomato")
ax.set_xlabel("Sample Distribution of $b_2$")
plt.show()


# With $10000$ rounds of simulation, the sampling distribution is centered below the true value $3$, i.e. biased downward.

# ## Measurement Error in Dependent Variables
# Now suppose the true relationship is
# $$
# Q_i = \beta_1+\beta_2X_i+v_i
# $$
# However, $Q_i$ cannot be precisely recorded; instead $Y_i$ is recorded, where
# $$
# Y_i = Q_i + r_i
# $$
# The true relationship can then be rewritten as
# $$
# Y_i= \beta_1+\beta_2X_i+v_i+r_i
# $$
# Note that $X_i$ is not affected, so OLS still provides unbiased and consistent estimates, as the short simulation below confirms.
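# A minimal simulation sketch (the parameter values and variable names below are illustrative assumptions, not the textbook data): with measurement error only in the dependent variable, the slope estimate stays centered at its true value.

# In[ ]:

# Measurement error in the dependent variable only adds noise; the slope stays unbiased.
b2_list = []
for i in range(10000):
    X = np.random.rand(50)
    v = np.random.randn(50)
    r = np.random.randn(50)      # recording error in the dependent variable
    Q = 2 + 3 * X + v            # true, unobserved dependent variable
    Y = Q + r                    # what we actually record
    b2_list.append(sp.stats.linregress(X, Y)[0])
print("Mean of b2 over 10000 simulations: {:.4f} (true value 3)".format(np.mean(b2_list)))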
# The composite disturbance term does, however, increase the population variance of the slope coefficient
# $$
# \sigma_{b_{2}}^{2}=\frac{\sigma_{v}^{2}+\sigma_{r}^{2}}{n \operatorname{Var}(X)}
# $$

# # Instrumental Variables Regression
# We have discussed how $X$ can be correlated with the disturbance term $u$, either due to omitted variables or measurement error in the independent variables. In the next chapter we will also see how simultaneous equations models cause interdependence between $X$ and $u$.
#
# Here we discuss a general method, **Instrumental Variable** (IV) regression, for obtaining consistent estimators when $X$ is correlated with $u$. The idea is to extract the part of the variation in $X$ that is uncorrelated with $u$, using an instrumental variable, and to use that part to obtain consistent estimators.
# Consider the model
# $$
# Y_{i}=\beta_{1}+\beta_{2} X_{i}+u_{i}
# $$
# where $X_i$ and $u_i$ are correlated, so the OLS estimators are inconsistent. The IV method requires identifying an instrument $Z_i$ that is correlated with $X_i$ but not with $u_i$.
#
# For the time being, we say that variables correlated with the disturbance term are **endogenous variables**, and otherwise **exogenous variables**. We will come back to these terms in the context of simultaneous equations.
#
# To be a valid instrument, two conditions need to be satisfied:
# $$
# \begin{aligned}
# &\operatorname{Cov}\left(Z_{i}, X_{i}\right) \neq 0 \\
# &\operatorname{Cov}\left(Z_{i}, u_{i}\right)=0
# \end{aligned}
# $$
# The philosophy of IV is to use $Z_i$ to capture the exogenous part of the movements of $X_i$.

# ## Two Stage Least Squares
# If both conditions are satisfied, the estimation procedure using the IV is called **Two-Stage Least Squares** (2SLS).
#
# _$1$st Stage_: decompose $X$ into two components, a problematic component that may be correlated with the regression error and a problem-free component that is uncorrelated with the disturbance term. For the simple linear regression model, the first stage begins with a regression model that links $X$ and $Z$
#
# $$
# X_i = \alpha_1 + \alpha_2Z_i + v_i
# $$
#
# The problem-free component is the fitted value $\hat{X}_i= a_1 + a_2 Z_i$, which is uncorrelated with $u_i$.
#
# _$2$nd Stage_: use the problem-free component to estimate $\beta_2$.
#
# In the context of simple linear regression, regress $Y_i$ on $\hat{X}_i$ using OLS. The resulting estimators are the 2SLS estimators.
# If you can derive the formula of the IV estimator, you won't need to go through these two steps explicitly. We can demonstrate how the IV estimator of $\beta_2$ is derived, starting from the covariance of $Z_i$ and $Y_i$
# $$
# \operatorname{Cov}(Z_i, Y_i) = \operatorname{Cov}(Z_i, \beta_{1}+\beta_{2} X_{i}+u_{i}) = \operatorname{Cov}(Z_i, \beta_{1}) + \operatorname{Cov}(Z_i, \beta_{2} X_{i}) + \operatorname{Cov}(Z_i, u_{i}) =\beta_{2} \operatorname{Cov}(Z_i, X_{i})
# $$
# Rearranging and denoting the estimator as $b_2^{\mathrm{IV}}$,
# $$
# b_2^{\mathrm{IV}} = \frac{\operatorname{Cov}(Z_i, Y_i)}{\operatorname{Cov}(Z_i, X_{i})}
# $$
# To compare the OLS and IV estimators for simple linear regression, here we reproduce the $b_2$ estimator of OLS
# $$
# b_{2}^{\mathrm{OLS}}=\frac{\operatorname{Cov}(X, Y)}{\operatorname{Var}(X)}=\frac{\operatorname{Cov}(X, Y)}{\operatorname{Cov}(X, X)}
# $$
# The IV estimator is obtained by replacing one $X$ with $Z$ in both the numerator and the denominator.
# $$
# \begin{aligned}
# b_{2}^{\mathrm{IV}} &=\frac{\operatorname{Cov}(Z, Y)}{\operatorname{Cov}(Z, X)}=\frac{\operatorname{Cov}\left(Z,\left[\beta_{1}+\beta_{2} X+u\right]\right)}{\operatorname{Cov}(Z, X)} \\
# &=\frac{\operatorname{Cov}\left(Z, \beta_{1}\right)+\operatorname{Cov}\left(Z, \beta_{2} X\right)+\operatorname{Cov}(Z, u)}{\operatorname{Cov}(Z, X)} \\
# &=\beta_{2}+\frac{\operatorname{Cov}(Z, u)}{\operatorname{Cov}(Z, X)}
# \end{aligned}
# $$
# This tells us that the accuracy of $b_{2}^{\mathrm{IV}}$ depends on the relative magnitudes of these covariances. In large samples, we expect the IV estimator to be consistent
# $$
# \operatorname{plim} b_{2}^{\mathrm{IV}}=\beta_{2}+\frac{\operatorname{plim} \operatorname{Cov}(Z, u)}{\operatorname{plim} \operatorname{Cov}(Z, X)}=\beta_{2}+\frac{0}{\sigma_{z x}}=\beta_{2}
# $$
# We can also compare the variances of the OLS and IV estimators in simple linear regression
# $$
# \sigma_{b_{2}^{\mathrm{IV}}}^{2}=\frac{\sigma_{u}^{2}}{n \sigma_{X}^{2}} \times \frac{1}{r_{X Z}^{2}}\\
# \sigma_{b_{2}^{\mathrm{OLS}}}^{2}=\frac{\sigma_{u}^{2}}{n \sigma_{X}^{2}}
# $$
# The greater the correlation between $X$ and $Z$, the smaller the variance $\sigma_{b_{2}^{\mathrm{IV}}}^{2}$.
#
# We will walk through a numerical example in the next chapter, after discussing the identification issue.
#
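# Although the full numerical treatment is deferred to the next chapter, a small illustrative sketch (all names and numbers below are assumptions, not the textbook example) already shows the formula $b_2^{\mathrm{IV}}=\operatorname{Cov}(Z,Y)/\operatorname{Cov}(Z,X)$ at work when $X$ is measured with error.

# In[ ]:

# Illustrative IV vs. OLS: X is observed with measurement error, Z is a valid instrument.
np.random.seed(0)
m = 10000
Z_iv = np.random.randn(m)                    # instrument, uncorrelated with the errors
X_true = Z_iv + 0.5 * np.random.randn(m)     # latent regressor, driven partly by Z_iv
u = np.random.randn(m)
Y = 3 + 4 * X_true + u                       # true relationship, beta2 = 4
X_obs = X_true + np.random.randn(m)          # regressor observed with error

cov_xy = np.cov(X_obs, Y)
b2_ols = cov_xy[0, 1] / cov_xy[0, 0]                           # biased toward zero
b2_iv = np.cov(Z_iv, Y)[0, 1] / np.cov(Z_iv, X_obs)[0, 1]      # consistent
print("OLS estimate: {:.3f}, IV estimate: {:.3f} (true beta2 = 4)".format(b2_ols, b2_iv))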