#!/usr/bin/env python
# coding: utf-8
# # Chapter 3 - Linear Regression
# - [Lab 3.6.2 Simple Linear Regression](#lab-3.6.2)
# - [Lab 3.6.3 Multiple Linear Regression](#lab-3.6.3)
# - [Lab 3.6.4 Interaction Terms](#lab-3.6.4)
# - [Lab 3.6.5 Non-linear Transformations of the Predictors](#lab-3.6.5)
# - [Lab 3.6.6 Qualitative Predictors](#lab-3.6.6)
# - [Exercise 8](#ex-8)
# - [Exercise 9](#ex-9)
# ### Imports and Configurations
# In[1]:
# Standard imports
import warnings
# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri
# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd
# StatsModels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from statsmodels.stats.anova import anova_lm
# scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Visulization
from IPython.display import display, HTML
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
sns.set_style('darkgrid')
import statsmodels.graphics.api as smg
# ### Utility Functions Definitions
# In[2]:
def ortho_poly_fit(x, degree = 1):
'''
Convert data into orthogonal basis for polynomial regression by QR decomposition.
Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
'''
n = degree + 1
x = np.asarray(x).flatten()
if(degree >= len(np.unique(x))):
stop("'degree' must be less than number of unique points")
xbar = np.mean(x)
x = x - xbar
X = np.fliplr(np.vander(x, n))
q,r = np.linalg.qr(X)
z = np.diag(np.diag(r))
raw = np.dot(q, z)
norm2 = np.sum(raw**2, axis=0)
alpha = (np.sum((raw**2)*np.reshape(x,(-1,1)), axis=0)/norm2 + xbar)[:degree]
Z = raw / np.sqrt(norm2)
return Z, norm2, alpha
def ortho_poly_predict(x, alpha, norm2, degree = 1):
'''
Convert new data for prediction into orthogonal basis, based on input parameters.
Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
'''
x = np.asarray(x).flatten()
n = degree + 1
Z = np.empty((len(x), n))
Z[:,0] = 1
if degree > 0:
Z[:, 1] = x - alpha[0]
if degree > 1:
for i in np.arange(1,degree):
Z[:, i+1] = (x - alpha[i]) * Z[:, i] - (norm2[i] / norm2[i-1]) * Z[:, i-1]
Z /= np.sqrt(norm2)
return Z
#
# ### Lab 3.6.2 Simple Linear Regression
# In[3]:
# Import Boston house price data from R package MASS
# This dataset is also available at https://archive.ics.uci.edu/ml/datasets/Housing
mass = importr('MASS')
boston_rdf = rdata(mass).fetch('Boston')['Boston']
boston = pandas2ri.ri2py(boston_rdf)
# In[4]:
# boston.describe # Head and tails in text.
# display(boston) # Head and tail in a data table.
HTML(boston.to_html(max_rows=60)) # All rows, or a subset in a data table.
# In[5]:
boston.info() # Summary of the data
# In[6]:
# Simple linear regression and plot using seaborn
sns.lmplot(x='lstat', y='medv', data=boston, scatter_kws={'color':'r', 's':15});
plt.xlim(xmin=0)
plt.ylim(ymin=0)
sns.plt.show()
plt.figure()
sns.residplot(x='lstat', y='medv', data=boston)
sns.plt.show()
# In[7]:
# Simple linear regression with scikit-learn
boston_sk = LinearRegression()
nrow = boston.shape[0]
boston_sk.fit(boston['lstat'].values.reshape(nrow, 1), boston['medv'].values.reshape(nrow, 1))
print("Slope: {}, Intercept: {}".format(boston_sk.coef_, boston_sk.intercept_))
# In[8]:
# Simple linear regression with StatsModels
boston_smslr = smf.ols('medv ~ lstat', data=boston).fit()
print(boston_smslr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smslr.mse_resid))
print("\nResiduals:")
display(boston_smslr.resid.describe()[3:])
# In[9]:
# Plot StatsModels OLS fitting results
fig, ax = plt.subplots()
plt.scatter(boston.lstat, boston.medv, color='r', s=15, alpha=0.5) # Data points
smg.abline_plot(model_results=boston_smslr, ax=ax) # Regression line
plt.xlim(0, 40)
plt.ylim(0, 55)
plt.xlabel('lstat')
plt.ylabel('medv')
plt.show()
# In[10]:
# 97.5% confidence interval for the coefficient estimates
ci = boston_smslr.conf_int(0.025)
ci.columns = ['2.5%', '97.5%']
print(ci)
# We can see the confidence interval computed by StatsModels is slightly difference than R.
# In[11]:
# Prediction (To be finished)
fit = boston_smslr.predict(exog=dict(lstat=[5,10,15]))
print(fit)
# Prediction Intervals
#from statsmodels.sandbox.regression.predstd import wls_prediction_std
#prstd, iv_l, iv_u = wls_prediction_std(boston_smslr)
#print(prstd,iv_l,iv_u)
# Confidence Intervals
# References:
# http://stackoverflow.com/questions/17559408/confidence-and-prediction-intervals-with-statsmodels
# http://stackoverflow.com/questions/20572706/mathematical-background-of-statsmodels-wls-prediction-std
# http://markthegraph.blogspot.com/2015/05/using-python-statsmodels-for-ols-linear.html
# http://statsmodels.sourceforge.net/devel/examples/notebooks/generated/wls.html?highlight=wls_prediction_std
# In[12]:
# Diagnostic plots for simple linear regression
infl = boston_smslr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))
# 1. Residuals. vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(boston_smslr.fittedvalues, boston_smslr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_external, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')
# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(boston_smslr.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')
# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(boston_smslr, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')
plt.tight_layout()
plt.show()
# In[13]:
# Find the data index with the maximum leverage statistics
infl.hat_matrix_diag.argmax() + 1
#
# ### Lab 3.6.3 Multiple Linear Regression
# In[14]:
# Regression with two predictors on Boston dataset
boston_smmlr = smf.ols('medv ~ lstat + age', data=boston).fit()
print(boston_smmlr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr.mse_resid))
print("\nResiduals:")
print(boston_smmlr.resid.describe()[3:])
# In[15]:
# Regression with all predictors on Boston dataset
all_predictors = '+'.join(boston.columns.drop('medv'))
boston_smmlr_all = smf.ols('medv ~ ' + all_predictors, data=boston).fit()
print(boston_smmlr_all.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_all.mse_resid))
print("\nResiduals:")
print(boston_smmlr_all.resid.describe()[3:])
# In[16]:
# Compute variance inflation factors
exog = boston_smmlr_all.model.exog
exog_names = boston_smmlr_all.model.exog_names
vif_df = pd.DataFrame([vif(exog, ix) for ix in range(1, exog.shape[1])]).transpose()
vif_df.columns = exog_names[1:]
display(vif_df)
#
# ### Lab 3.6.4 Interaction Terms
# In[17]:
# Regression with two interaction terms on Boston dataset
boston_smmlr_inter = smf.ols('medv ~ lstat * age', data=boston).fit()
print(boston_smmlr_inter.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_inter.mse_resid))
print("\nResiduals:")
print(boston_smmlr_inter.resid.describe()[3:])
#
# ### Lab 3.6.5 Non-linear Transformations of the Predictors
# In[18]:
# Regression with quandratic term on Boston dataset
boston_smmlr_quad = smf.ols('medv ~ lstat + I(lstat**2)', data=boston).fit()
print(boston_smmlr_quad.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_quad.mse_resid))
print("\nResiduals:")
print(boston_smmlr_quad.resid.describe()[3:])
# In[19]:
# ANOVA for both simple linear and quandratic fits
with warnings.catch_warnings():
warnings.filterwarnings("ignore") ## Supress warnings
display(anova_lm(boston_smslr, boston_smmlr_quad))
# In[20]:
# Diagnostic plots for quandratic fit
infl = boston_smmlr_quad.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))
# 1. Residuals. vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(boston_smmlr_quad.fittedvalues, boston_smmlr_quad.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_external, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')
# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(boston_smmlr_quad.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')
# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(boston_smslr, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')
plt.tight_layout()
plt.show()
# In[21]:
# Regression with polynomial terms on Boston dataset
# poly() function in R produces orthogonal polynomials with QR decomposition by default
# Without orthogonization, powers of X are correlated, and regression on correlated predictors
# leads tounstable coefficients.
# In addition, large powers of X will require very small regression coefficients,
# potentially leading to underflow and coefficients that are hard to interpret or compare intuitively.
# Ref: http://davmre.github.io/python/2013/12/15/orthogonal_poly
poly_degree = 5
boston_smmlr_poly = smf.ols('medv ~ ortho_poly_fit(lstat, poly_degree)[0][:, 1:]', data=boston).fit()
print(boston_smmlr_poly.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(boston_smmlr_poly.mse_resid))
print("\nResiduals:")
print(boston_smmlr_poly.resid.describe()[3:])
#
# ### Lab 3.6.6 Qualitative Predictors
# In[22]:
# Import Carseats data from R package ISLR
islr = importr('ISLR')
carseats_rdf = rdata(islr).fetch('Carseats')['Carseats']
carseats = pandas2ri.ri2py(carseats_rdf)
display(carseats)
# In[23]:
# Regression with all terms plus two interaction terms on Carseats dataset
all_predictors = '+'.join(carseats.columns.drop('Sales'))
carseats_smmlr = smf.ols('Sales ~ ' + all_predictors + ' + Income:Advertising + Price:Age', data=carseats).fit()
print(carseats_smmlr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(carseats_smmlr.mse_resid))
print("\nResiduals:")
print(carseats_smmlr.resid.describe()[3:])
# In[24]:
# Show contrast matrix, the dummy variable coding
from patsy.contrasts import Treatment
levels = list(carseats.ShelveLoc.unique())
contrast = Treatment(reference=0).code_without_intercept(levels)
cm_df = pd.DataFrame(contrast.matrix, columns=contrast.column_suffixes, index=levels, dtype=int)
display(cm_df)
#
# ### Exercise 8
# In[25]:
# Auto dataset is in R ISLR package
islr = importr('ISLR')
auto_rdf = rdata(islr).fetch('Auto')['Auto']
auto = pandas2ri.ri2py(auto_rdf)
display(auto)
# In[26]:
# simple linear regression on Auto dataset
auto_slr = smf.ols('mpg ~ horsepower ', data=auto).fit()
print(auto_slr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(auto_slr.mse_resid))
print("\nResiduals:")
print(auto_slr.resid.describe()[3:])
# In[27]:
# 8 (a) - iv. Prediction
auto_pred = auto_slr.predict(exog=dict(horsepower=98))
print(auto_pred)
# In[28]:
# Plot SLR fitting results
fig, ax = plt.subplots()
plt.scatter(auto.horsepower, auto.mpg, color='r', s=15, alpha=0.5) # Data points
smg.abline_plot(model_results=auto_slr, ax=ax) # Regression line
plt.xlim(25, 250)
plt.ylim(5, 50)
plt.xlabel('Horsepower')
plt.ylabel('MPG')
plt.title('Simple Linear Regression on Auto dataset')
plt.show()
# In[29]:
# Diagnostic plots
infl = auto_slr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))
# 1. Residuals. vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(auto_slr.fittedvalues, auto_slr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_internal, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')
# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(auto_slr.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')
# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(auto_slr, size=20, ax=ax4)
plt.xlim(0, 0.03)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')
plt.tight_layout()
plt.show()
#
# ### Exercise 9
# In[30]:
# 9 - (a) Plot scatter matrix.
sns.pairplot(auto, size=2)
plt.show()
# In[31]:
# 9 - (b) Compute correlation matrix.
auto.corr()
# In[32]:
# 9 - (c) Multiple linear regression on Auto dataset.
formula = 'mpg ~ ' + ' + '.join(auto.columns.drop(['name', 'mpg']))
print('\nRegression formula: {}\n'.format(formula))
auto_mlr = smf.ols(formula, data=auto).fit()
print(auto_mlr.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(auto_mlr.mse_resid))
print("\nResiduals:")
print(auto_mlr.resid.describe()[3:])
# In[33]:
# Diagnostic plots
infl = auto_mlr.get_influence()
fig, ax = plt.subplots(2, 2, figsize=(12,12))
# 1. Residuals. vs. fitted values
ax1 = plt.subplot(221)
plt.scatter(auto_mlr.fittedvalues, auto_mlr.resid, alpha=0.75)
plt.title('Residuals vs Fitted')
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
# 2. Normal Q-Q plot
ax2 = plt.subplot(222)
smg.qqplot(infl.resid_studentized_internal, line='q', ax=ax2)
plt.title('Normal Q-Q')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Standardized residuals')
# 3. Square-root absolute standardized residuals vs. fitted values
ax3 = plt.subplot(223)
plt.scatter(auto_mlr.fittedvalues, np.sqrt(np.absolute(infl.resid_studentized_external)), alpha=0.75)
plt.title('Scale-Location')
plt.xlabel('Fitted values')
plt.ylabel(r'$\sqrt{|Studentized\ Residuals|}$')
# 4. Standardized residuals vs. leverage statistics
ax4 = plt.subplot(224)
smg.influence_plot(auto_mlr, size=20, ax=ax4)
plt.xlim(0, 0.2)
plt.xlabel('Leverage')
plt.ylabel('Standardized residuals')
plt.title('Residuals vs Leverage')
plt.tight_layout()
plt.show()
# In[34]:
# 9 - (e) Multiple linear regression with interaction terms on Auto dataset.
formula = 'mpg ~ ' + ' + '.join(auto.columns.drop(['name', 'mpg'])) + ' + horsepower:weight'
print('\nRegression formula: {}\n'.format(formula))
auto_mlr_inter = smf.ols(formula, data=auto).fit()
print(auto_mlr_inter.summary())
# Display residual SE and quantiles not shown in SM summary
print("\nResidual Standard Error:")
print(np.sqrt(auto_mlr_inter.mse_resid))
print("\nResiduals:")
print(auto_mlr_inter.resid.describe()[3:])