#!/usr/bin/env python
# coding: utf-8

# # Chapter 3 - Linear Regression

# - [Lab 3.6.2 Simple Linear Regression](#lab-3.6.2)
# - [Lab 3.6.3 Multiple Linear Regression](#lab-3.6.3)
# - [Lab 3.6.4 Interaction Terms](#lab-3.6.4)
# - [Lab 3.6.5 Non-linear Transformations of the Predictors](#lab-3.6.5)
# - [Lab 3.6.6 Qualitative Predictors](#lab-3.6.6)
# - [Exercise 8](#ex-8)
# - [Exercise 9](#ex-9)

# ### Imports and Configurations

# In[1]:

# Standard imports
import warnings

# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri

# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd

# StatsModels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from statsmodels.stats.anova import anova_lm

# scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Visualization
from IPython.display import display, HTML
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

get_ipython().run_line_magic('matplotlib', 'inline')
sns.set_style('darkgrid')


# ### Utility Function Definitions

# In[2]:

def ortho_poly_fit(x, degree=1):
    '''
    Convert data into an orthogonal basis for polynomial regression by QR decomposition.
    Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
    '''
    n = degree + 1
    x = np.asarray(x).flatten()
    if degree >= len(np.unique(x)):
        raise ValueError("'degree' must be less than number of unique points")
    xbar = np.mean(x)
    x = x - xbar
    X = np.fliplr(np.vander(x, n))
    q, r = np.linalg.qr(X)
    z = np.diag(np.diag(r))
    raw = np.dot(q, z)
    norm2 = np.sum(raw**2, axis=0)
    alpha = (np.sum((raw**2) * np.reshape(x, (-1, 1)), axis=0) / norm2 + xbar)[:degree]
    Z = raw / np.sqrt(norm2)
    return Z, norm2, alpha


def ortho_poly_predict(x, alpha, norm2, degree=1):
    '''
    Convert new data for prediction into an orthogonal basis, based on input parameters.
    Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
    '''
    x = np.asarray(x).flatten()
    n = degree + 1
    Z = np.empty((len(x), n))
    Z[:, 0] = 1
    if degree > 0:
        Z[:, 1] = x - alpha[0]
    if degree > 1:
        for i in np.arange(1, degree):
            Z[:, i+1] = (x - alpha[i]) * Z[:, i] - (norm2[i] / norm2[i-1]) * Z[:, i-1]
    Z /= np.sqrt(norm2)
    return Z


# ### Lab 3.6.2 Simple Linear Regression

# In[3]:

# Import the Boston house price data from R package MASS.
# This dataset is also available at https://archive.ics.uci.edu/ml/datasets/Housing
mass = importr('MASS')
boston_rdf = rdata(mass).fetch('Boston')['Boston']
boston = pandas2ri.ri2py(boston_rdf)


# In[4]:

# boston.describe  # Head and tail in text.
# display(boston)  # Head and tail in a data table.
HTML(boston.to_html(max_rows=60))  # All rows, or a subset, in a data table.
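# The cells above use the rpy2 2.x API. In rpy2 >= 3.0, `pandas2ri.ri2py()` was
# removed in favor of an explicit conversion context; the optional cell below
# sketches the equivalent conversion for newer rpy2 versions only.

# In[ ]:

# A minimal sketch, assuming rpy2 >= 3.0 is installed; skip this cell on rpy2 2.x.
import rpy2.robjects as ro
from rpy2.robjects.conversion import localconverter

# The localconverter context converts the R data.frame to a pandas DataFrame.
with localconverter(ro.default_converter + pandas2ri.converter):
    boston = ro.conversion.rpy2py(boston_rdf)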
# In[5]:

boston.info()  # Summary of the data


# In[6]:

# Simple linear regression and plot using seaborn
sns.lmplot(x='lstat', y='medv', data=boston, scatter_kws={'color': 'r', 's': 15})
plt.xlim(xmin=0)
plt.ylim(ymin=0)
plt.show()

plt.figure()
sns.residplot(x='lstat', y='medv', data=boston)
plt.show()


# In[7]:

# Simple linear regression with scikit-learn
boston_sk = LinearRegression()
nrow = boston.shape[0]
boston_sk.fit(boston['lstat'].values.reshape(nrow, 1),
              boston['medv'].values.reshape(nrow, 1))
print("Slope: {}, Intercept: {}".format(boston_sk.coef_, boston_sk.intercept_))


# In[8]:

# Simple linear regression with StatsModels
boston_smslr = smf.ols('medv ~ lstat', data=boston).fit()
print(boston_smslr.summary())

# Display the residual SE and quantiles not shown in the StatsModels summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smslr.mse_resid))
print("\nResiduals:")
display(boston_smslr.resid.describe()[3:])


# In[9]:

# Plot StatsModels OLS fitting results
fig, ax = plt.subplots()
plt.scatter(boston.lstat, boston.medv, color='r', s=15, alpha=0.5)  # Data points
smg.abline_plot(model_results=boston_smslr, ax=ax)  # Regression line
plt.xlim(0, 40)
plt.ylim(0, 55)
plt.xlabel('lstat')
plt.ylabel('medv')
plt.show()


# In[10]:

# 95% confidence intervals for the coefficient estimates.
# conf_int(alpha) returns the 100*(1 - alpha)% interval, so alpha=0.05
# gives the 2.5%/97.5% bounds reported by R's confint().
ci = boston_smslr.conf_int(0.05)
ci.columns = ['2.5%', '97.5%']
print(ci)


# In[11]:

# Prediction; the confidence and prediction intervals are computed in the next cell.
fit = boston_smslr.predict(exog=dict(lstat=[5, 10, 15]))
print(fit)

# References on confidence vs. prediction intervals:
# http://stackoverflow.com/questions/17559408/confidence-and-prediction-intervals-with-statsmodels
# http://stackoverflow.com/questions/20572706/mathematical-background-of-statsmodels-wls-prediction-std
# http://markthegraph.blogspot.com/2015/05/using-python-statsmodels-for-ols-linear.html
# http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/wls.html?highlight=wls_prediction_std
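# R's predict(..., interval="confidence") and predict(..., interval="prediction")
# have a direct statsmodels counterpart: get_prediction(), available since
# statsmodels 0.8. The sketch below completes the prediction cell above.

# In[ ]:

# A sketch: summary_frame() reports the fitted mean together with its confidence
# interval (mean_ci_lower/upper) and the prediction interval for a new
# observation (obs_ci_lower/upper), matching the two R interval types.
pred = boston_smslr.get_prediction(dict(lstat=[5, 10, 15]))
display(pred.summary_frame(alpha=0.05))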
# In[12]:

# Diagnostic plots for simple linear regression
infl = boston_smslr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12, 12))

# 1. Residuals vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(boston_smslr.fittedvalues, boston_smslr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')

# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_external, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')

# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(boston_smslr.fittedvalues,
            np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')

# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(boston_smslr, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')

plt.tight_layout()
plt.show()


# In[13]:

# Find the observation with the maximum leverage statistic.
# The +1 converts the 0-based Python index to R's 1-based indexing.
infl.hat_matrix_diag.argmax() + 1


# ### Lab 3.6.3 Multiple Linear Regression

# In[14]:

# Regression with two predictors on the Boston dataset
boston_smmlr = smf.ols('medv ~ lstat + age', data=boston).fit()
print(boston_smmlr.summary())

# Display the residual SE and quantiles not shown in the StatsModels summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr.mse_resid))
print("\nResiduals:")
print(boston_smmlr.resid.describe()[3:])


# In[15]:

# Regression with all predictors on the Boston dataset
all_predictors = '+'.join(boston.columns.drop('medv'))
boston_smmlr_all = smf.ols('medv ~ ' + all_predictors, data=boston).fit()
print(boston_smmlr_all.summary())

# Display the residual SE and quantiles not shown in the StatsModels summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_all.mse_resid))
print("\nResiduals:")
print(boston_smmlr_all.resid.describe()[3:])


# In[16]:

# Compute variance inflation factors (skipping the intercept column)
exog = boston_smmlr_all.model.exog
exog_names = boston_smmlr_all.model.exog_names
vif_df = pd.DataFrame([vif(exog, ix) for ix in range(1, exog.shape[1])]).transpose()
vif_df.columns = exog_names[1:]
display(vif_df)


# ### Lab 3.6.4 Interaction Terms

# In[17]:

# Regression with interaction terms on the Boston dataset:
# 'lstat * age' expands to lstat + age + lstat:age.
boston_smmlr_inter = smf.ols('medv ~ lstat * age', data=boston).fit()
print(boston_smmlr_inter.summary())

# Display the residual SE and quantiles not shown in the StatsModels summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_inter.mse_resid))
print("\nResiduals:")
print(boston_smmlr_inter.resid.describe()[3:])


# ### Lab 3.6.5 Non-linear Transformations of the Predictors

# In[18]:

# Regression with a quadratic term on the Boston dataset
boston_smmlr_quad = smf.ols('medv ~ lstat + I(lstat**2)', data=boston).fit()
print(boston_smmlr_quad.summary())

# Display the residual SE and quantiles not shown in the StatsModels summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_quad.mse_resid))
print("\nResiduals:")
print(boston_smmlr_quad.resid.describe()[3:])


# In[19]:

# ANOVA comparing the simple linear and quadratic fits
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")  # Suppress warnings
    display(anova_lm(boston_smslr, boston_smmlr_quad))


# In[20]:

# Diagnostic plots for the quadratic fit
infl = boston_smmlr_quad.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12, 12))

# 1. Residuals vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(boston_smmlr_quad.fittedvalues, boston_smmlr_quad.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')

# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_external, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')

# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(boston_smmlr_quad.fittedvalues,
            np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')

# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(boston_smmlr_quad, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')

plt.tight_layout()
plt.show()
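# The R version of this lab also fits a log transformation of a predictor,
# lm(medv ~ log(rm)). Patsy formulas evaluate numpy functions directly, so the
# sketch below needs no precomputed columns.

# In[ ]:

# A sketch: log-transforming rm inside the formula itself.
boston_smmlr_log = smf.ols('medv ~ np.log(rm)', data=boston).fit()
print(boston_smmlr_log.summary())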
# In[21]:

# Regression with polynomial terms on the Boston dataset.
# R's poly() function produces orthogonal polynomials via QR decomposition by default.
# Without orthogonalization, powers of X are correlated, and regression on correlated
# predictors leads to unstable coefficients. In addition, large powers of X require
# very small regression coefficients, potentially leading to underflow and
# coefficients that are hard to interpret or compare intuitively.
# Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
poly_degree = 5
boston_smmlr_poly = smf.ols('medv ~ ortho_poly_fit(lstat, poly_degree)[0][:, 1:]',
                            data=boston).fit()
print(boston_smmlr_poly.summary())

# Display the residual SE and quantiles not shown in the StatsModels summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_poly.mse_resid))
print("\nResiduals:")
print(boston_smmlr_poly.resid.describe()[3:])


# ### Lab 3.6.6 Qualitative Predictors

# In[22]:

# Import the Carseats data from R package ISLR
islr = importr('ISLR')
carseats_rdf = rdata(islr).fetch('Carseats')['Carseats']
carseats = pandas2ri.ri2py(carseats_rdf)
display(carseats)


# In[23]:

# Regression with all terms plus two interaction terms on the Carseats dataset
all_predictors = '+'.join(carseats.columns.drop('Sales'))
carseats_smmlr = smf.ols('Sales ~ ' + all_predictors + ' + Income:Advertising + Price:Age',
                         data=carseats).fit()
print(carseats_smmlr.summary())

# Display the residual SE and quantiles not shown in the StatsModels summary
print("\nResidual Standard Error:")
print(np.sqrt(carseats_smmlr.mse_resid))
print("\nResiduals:")
print(carseats_smmlr.resid.describe()[3:])


# In[24]:

# Show the contrast matrix, i.e. the dummy-variable coding patsy uses for ShelveLoc
from patsy.contrasts import Treatment

levels = list(carseats.ShelveLoc.unique())
contrast = Treatment(reference=0).code_without_intercept(levels)
cm_df = pd.DataFrame(contrast.matrix, columns=contrast.column_suffixes,
                     index=levels, dtype=int)
display(cm_df)


# ### Exercise 8

# In[25]:

# The Auto dataset is in the R ISLR package
islr = importr('ISLR')
auto_rdf = rdata(islr).fetch('Auto')['Auto']
auto = pandas2ri.ri2py(auto_rdf)
display(auto)


# In[26]:

# 8 (a) Simple linear regression on the Auto dataset
auto_slr = smf.ols('mpg ~ horsepower', data=auto).fit()
print(auto_slr.summary())

# Display the residual SE and quantiles not shown in the StatsModels summary
print("\nResidual Standard Error:")
print(np.sqrt(auto_slr.mse_resid))
print("\nResiduals:")
print(auto_slr.resid.describe()[3:])


# In[27]:

# 8 (a) - iv. Prediction for horsepower = 98
auto_pred = auto_slr.predict(exog=dict(horsepower=98))
print(auto_pred)


# In[28]:

# Plot SLR fitting results
fig, ax = plt.subplots()
plt.scatter(auto.horsepower, auto.mpg, color='r', s=15, alpha=0.5)  # Data points
smg.abline_plot(model_results=auto_slr, ax=ax)  # Regression line
plt.xlim(25, 250)
plt.ylim(5, 50)
plt.xlabel('Horsepower')
plt.ylabel('MPG')
plt.title('Simple Linear Regression on Auto dataset')
plt.show()
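# Exercise 8 (a) iv. also asks for the 95% confidence and prediction intervals
# associated with horsepower = 98; the get_prediction() sketch from Lab 3.6.2
# applies here unchanged.

# In[ ]:

# A sketch: confidence interval (mean_ci_*) and prediction interval (obs_ci_*)
# for mpg at horsepower = 98.
auto_pred98 = auto_slr.get_prediction(dict(horsepower=[98]))
display(auto_pred98.summary_frame(alpha=0.05))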
# In[29]:

# Diagnostic plots
infl = auto_slr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12, 12))

# 1. Residuals vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(auto_slr.fittedvalues, auto_slr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')

# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_internal, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')

# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(auto_slr.fittedvalues,
            np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')

# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(auto_slr, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')

plt.tight_layout()
plt.show()


# ### Exercise 9

# In[30]:

# 9 - (a) Plot the scatterplot matrix.
sns.pairplot(auto, height=2)
plt.show()


# In[31]:

# 9 - (b) Compute the correlation matrix, excluding the qualitative 'name' column.
auto.drop(columns='name').corr()


# In[32]:

# 9 - (c) Multiple linear regression on the Auto dataset.
formula = 'mpg ~ ' + ' + '.join(auto.columns.drop(['name', 'mpg']))
print('\nRegression formula: {}\n'.format(formula))
auto_mlr = smf.ols(formula, data=auto).fit()
print(auto_mlr.summary())

# Display the residual SE and quantiles not shown in the StatsModels summary
print("\nResidual Standard Error:")
print(np.sqrt(auto_mlr.mse_resid))
print("\nResiduals:")
print(auto_mlr.resid.describe()[3:])


# In[33]:

# 9 - (d) Diagnostic plots
infl = auto_mlr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12, 12))

# 1. Residuals vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(auto_mlr.fittedvalues, auto_mlr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')

# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_internal, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')

# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(auto_mlr.fittedvalues,
            np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')

# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(auto_mlr, size=20, ax=ax4)
plt.xlim(0, 0.2)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')

plt.tight_layout()
plt.show()


# In[34]:

# 9 - (e) Multiple linear regression with an interaction term on the Auto dataset.
formula = 'mpg ~ ' + ' + '.join(auto.columns.drop(['name', 'mpg'])) + ' + horsepower:weight'
print('\nRegression formula: {}\n'.format(formula))
auto_mlr_inter = smf.ols(formula, data=auto).fit()
print(auto_mlr_inter.summary())

# Display the residual SE and quantiles not shown in the StatsModels summary
print("\nResidual Standard Error:")
print(np.sqrt(auto_mlr_inter.mse_resid))
print("\nResiduals:")
print(auto_mlr_inter.resid.describe()[3:])
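# To judge whether the horsepower:weight interaction actually improves the fit,
# the nested-model ANOVA comparison used in Lab 3.6.5 applies here as well;
# a short sketch:

# In[ ]:

# A sketch: compare the additive model against the model with the interaction term.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")  # Suppress warnings, as in Lab 3.6.5
    display(anova_lm(auto_mlr, auto_mlr_inter))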