#!/usr/bin/env python
# coding: utf-8
# # Chapter 4 - Classification
# - [Lab 4.6.1 The Stock Market Data](#lab-4.6.1)
# - [Lab 4.6.2 Logistic Regression](#lab-4.6.2)
# - [Lab 4.6.3 Linear Discriminant Analysis](#lab-4.6.3)
# - [Lab 4.6.4 Quadratic Discriminant Analysis](#lab-4.6.4)
# - [Lab 4.6.5 K-Nearest Neighbors](#lab-4.6.5)
# - [Lab 4.6.6 An Application to Caravan Insurance Data](#lab-4.6.6)
# ### Imports and Configurations
# In[1]:
# Standard imports
import warnings
# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri
# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd
# StatsModels
import statsmodels.api as sm
import statsmodels.formula.api as smf
# scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix, classification_report
# Visulization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')  # notebook-export artifact: this script requires an IPython kernel
mpl.style.use('ggplot')  # ggplot style to mimic the R plots in the ISLR text
import statsmodels.graphics.api as smg
#
# ### Lab 4.6.1 The Stock Market Data
# In[2]:
# Import Smarket dataset from R package ISLR
# Load the R package and fetch the 'Smarket' data.frame, then convert it
# to a pandas DataFrame.
islr = importr('ISLR')
smarket_rdf = rdata(islr).fetch('Smarket')['Smarket']
# NOTE(review): pandas2ri.ri2py was removed in rpy2 3.x (replaced by the
# conversion-context API) -- presumably this targets rpy2 2.x; confirm the
# pinned rpy2 version before running.
smarket = pandas2ri.ri2py(smarket_rdf)
# In[3]:
# Display dataset structures and statistics
display(smarket.head())
display(smarket.info())  # info() prints to stdout and returns None
display(smarket.describe())
# In[4]:
# Correlation matrix
# NOTE(review): 'Direction' is non-numeric; on pandas >= 2.0, DataFrame.corr()
# raises on object columns instead of silently dropping them -- may need
# corr(numeric_only=True) there. Confirm the pandas version in use.
display(smarket.corr())
# In[5]:
# Bar plot of daily S&P 500 trading volume over the whole sample.
ax = smarket.plot('Year', 'Volume', kind='bar', figsize=(15,6), color='k')
# With one bar per trading day the default tick labels are unreadable, so
# blank them all and label only the first trading day of each year.
labels = [''] * smarket.shape[0]
years, first_positions = np.unique(smarket.Year, return_index=True)
for year, pos in zip(years, first_positions):
    labels[pos] = str(int(year))
ax.set_xticklabels(labels, rotation=0)
ax.set_ylabel('Volume')
ax.set_title('S&P 500 volume over 1250 days.')
plt.show()
#
# ### Lab 4.6.2 Logistic Regression
# ##### StatsModels
# In[6]:
# Logistic regression by GLM
# Binomial family makes this a logistic regression; the formula API encodes
# the two-level Direction factor as 0/1 automatically.
formula = 'Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume'
smarket_glm = smf.glm(formula, data=smarket, family=sm.families.Binomial()).fit()
print('Deviance Residuals:')
display(smarket_glm.resid_deviance.describe())
print(smarket_glm.summary())
# Reproduce the tail of R's summary.glm output: the null model has
# df_model + df_resid = n - 1 degrees of freedom.
print('\n Null deviance: {0:.1f} on {1} degrees of freedom'.format(smarket_glm.null_deviance, smarket_glm.df_model+smarket_glm.df_resid))
print('Residual deviance: {0:.1f} on {1} degrees of freedom'.format(smarket_glm.deviance, smarket_glm.df_resid))
print('AIC: {0:.2f}'.format(smarket_glm.aic))
# ##### scikit-learn LogisticRegression
# In[7]:
# Specify features and response
features = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']
response = 'Direction'
# Fit logistic regression on the entire dataset (no hold-out yet)
X = smarket[features]
y = smarket[response]
logreg = LogisticRegression(C=1e9)  # very large C => essentially no regularization, matching R's glm()
logreg.fit(X, y)
# Present intercept + slopes as a single unlabeled row, R-style
coef = pd.DataFrame(logreg.coef_, columns=features, index=[''])
coef.insert(loc=0, column='(Intercept)', value=logreg.intercept_)
display(coef)
# In[8]:
# In-sample class-1 probabilities, i.e. R's predict(type='response')
print("First ten in-sample prediction probabilities P(Y=1|X): ")
display(logreg.predict_proba(X)[:10, 1])
# Hard class decisions from the same fit
y_pred = logreg.predict(X)
print("In-sample prediction decision results: ")
display(y_pred[:10])
# In[9]:
# Assess in-sample accuracy via a labelled confusion matrix and mean accuracy
pred_axis = pd.MultiIndex.from_product([['Predict'], logreg.classes_])
true_axis = pd.MultiIndex.from_product([['True'], logreg.classes_])
cfmat = confusion_matrix(y, y_pred, labels=logreg.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=pred_axis, index=true_axis))
print('\nScore: ', logreg.score(X, y))
# In[10]:
# Hold out year 2005 as the test period; train on 2001-2004
in_2005 = smarket.Year == 2005
smarket_train = smarket[~in_2005]
smarket_test = smarket[in_2005]
print("Training dataset shape: ", smarket_train.shape)
print("Test dataset shape: ", smarket_test.shape)
# In[11]:
# Refit the six-predictor logistic regression on the pre-2005 subset only
features = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']
response = 'Direction'
logreg = LogisticRegression(C=1e9)  # large C disables regularization
logreg.fit(smarket_train[features], smarket_train[response])
# Predict the held-out 2005 observations
X = smarket_test[features]
y = smarket_test[response]
y_pred = logreg.predict(X)
# Out-of-sample confusion matrix and accuracy
pred_axis = pd.MultiIndex.from_product([['Predict'], logreg.classes_])
true_axis = pd.MultiIndex.from_product([['True'], logreg.classes_])
cfmat = confusion_matrix(y, y_pred, labels=logreg.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=pred_axis, index=true_axis))
print('\nScore: ', logreg.score(X, y))
# In[12]:
# Drop the predictors with large p-values and refit with Lag1/Lag2 only,
# hoping for a better out-of-sample fit
features = ['Lag1', 'Lag2']
response = 'Direction'
logreg = LogisticRegression(C=1e9)  # large C disables regularization
logreg.fit(smarket_train[features], smarket_train[response])
# Predict the held-out 2005 observations
X = smarket_test[features]
y = smarket_test[response]
y_pred = logreg.predict(X)
# Out-of-sample confusion matrix and accuracy
pred_axis = pd.MultiIndex.from_product([['Predict'], logreg.classes_])
true_axis = pd.MultiIndex.from_product([['True'], logreg.classes_])
cfmat = confusion_matrix(y, y_pred, labels=logreg.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=pred_axis, index=true_axis))
print('\nScore: ', logreg.score(X, y))
#
# ### Lab 4.6.3 Linear Discriminant Analysis
# In[13]:
# Fit LDA on the training years using the two retained predictors
features = ['Lag1', 'Lag2']
response = 'Direction'
X = smarket_train[features]
y = smarket_train[response]
lda = LinearDiscriminantAnalysis()
lda.fit(X, y)
# Report priors, per-class means, and LD coefficients, mimicking the
# printout of R's MASS::lda()
print("Prior probabilities of groups:")
display(pd.DataFrame(lda.priors_, index=lda.classes_, columns=['']).T)
print("\nGroup means:")
display(pd.DataFrame(lda.means_, index=lda.classes_, columns=features))
print("\nCoefficients of linear discriminants:")
display(pd.DataFrame(lda.scalings_, columns=['LD1'], index=features))
# In[14]:
# Plot the linear discriminant scores of the LDA training fit, one
# histogram per class (Down / Up), on a shared x-range for comparison.
smarket_grouped = smarket_train.groupby('Direction')
smarket_down = smarket_grouped.get_group('Down')
smarket_up = smarket_grouped.get_group('Up')
discrim_down = lda.transform(smarket_down[['Lag1', 'Lag2']])
discrim_up = lda.transform(smarket_up[['Lag1', 'Lag2']])
plt.figure(figsize=(12,6))
plt.subplot(2, 1, 1)
# BUG FIX: the `normed` kwarg was deprecated in Matplotlib 2.1 and removed
# in 3.1; `density=True` is the equivalent (histogram integrates to 1).
plt.hist(discrim_down, 16, density=True, color='c')
plt.title('group Down')
plt.xlim(-5, 5)
plt.subplot(2, 1, 2)
plt.hist(discrim_up, 16, density=True, color='c')
plt.title('group Up')
plt.xlim(-5, 5)
plt.show()
# In[15]:
# Predict the 2005 test period with the trained LDA model
X = smarket_test[['Lag1', 'Lag2']]
y = smarket_test['Direction']
y_pred = lda.predict(X)
# Posterior class probabilities P(class | x) for each test observation
posterior = pd.DataFrame(lda.predict_proba(X), columns=lda.classes_)
# Linear discriminant scores on the test subset
discrim_test = lda.transform(X)
# In[16]:
# Out-of-sample confusion matrix and accuracy for the LDA fit
true_axis = pd.MultiIndex.from_product([['True'], lda.classes_])
pred_axis = pd.MultiIndex.from_product([['Predict'], lda.classes_])
cfmat = confusion_matrix(y, y_pred, labels=lda.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=pred_axis, index=true_axis))
print('\nScore: ', lda.score(X, y))
# In[17]:
# Inspect the first 20 posteriors and the matching class decisions
print('Posterior probabilities:')
print(posterior.iloc[:20])
print('\nFirst 20 prediction results: \n', y_pred[:20])
# How many test days clear a stricter 0.9 posterior threshold for 'Down'?
print('\nNumber of Down class with threshold = 0.9: ', (posterior['Down'] > 0.9).sum())
#
# ### Lab 4.6.4 Quadratic Discriminant Analysis
# In[18]:
# Specify features and response
features = ['Lag1', 'Lag2']
response = 'Direction'
# Fit on training subset
X = smarket_train[features]
y = smarket_train[response]
qda = QuadraticDiscriminantAnalysis()
qda.fit(X, y)
# Priors, group means, and coefficients of linear discriminants
priors = pd.DataFrame(qda.priors_, index=qda.classes_, columns=['']).T
print("Prior probabilities of groups:")
display(priors)
gmeans = pd.DataFrame(qda.means_, index=qda.classes_, columns=features)
print("\nGroup means:")
display(gmeans)
# NOTE(review): sklearn's QDA `scalings_` holds per-class rotation scalings
# of the covariance, not discriminant coefficients (QDA has no single linear
# discriminant; R's qda() prints no coefficients either). The 2x2 table below
# only lines up because n_classes == n_features == 2 here -- the
# "coefficients" labelling is presumably misleading; verify intent.
coef = pd.DataFrame(qda.scalings_, columns=['QD1', 'QD2'], index=features)
print("\nCoefficients of quadratic discriminants:")
display(coef)
# In[19]:
# Predict the 2005 test subset with the fitted QDA model
X = smarket_test[features]
y = smarket_test[response]
y_pred = qda.predict(X)
# Out-of-sample confusion matrix and accuracy
pred_axis = pd.MultiIndex.from_product([['Predict'], qda.classes_])
true_axis = pd.MultiIndex.from_product([['True'], qda.classes_])
cfmat = confusion_matrix(y, y_pred, labels=qda.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=pred_axis, index=true_axis))
print('\nScore: ', qda.score(X, y))
#
# ### Lab 4.6.5 K-Nearest Neighbors
# In[20]:
# 1-nearest-neighbour classifier on the training years
features = ['Lag1', 'Lag2']
response = 'Direction'
K = 1
knn = KNeighborsClassifier(n_neighbors=K)
knn.fit(smarket_train[features], smarket_train[response])
# Predict the 2005 test subset
X = smarket_test[features]
y = smarket_test[response]
y_pred = knn.predict(X)
# Out-of-sample confusion matrix and accuracy
pred_axis = pd.MultiIndex.from_product([['Predict'], knn.classes_])
true_axis = pd.MultiIndex.from_product([['True'], knn.classes_])
cfmat = confusion_matrix(y, y_pred, labels=knn.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=pred_axis, index=true_axis))
print('\nScore: ', knn.score(X, y))
# In[21]:
# Repeat the KNN fit with a larger neighbourhood, K=3
features = ['Lag1', 'Lag2']
response = 'Direction'
K = 3
knn = KNeighborsClassifier(n_neighbors=K)
knn.fit(smarket_train[features], smarket_train[response])
# Predict the 2005 test subset
X = smarket_test[features]
y = smarket_test[response]
y_pred = knn.predict(X)
# Out-of-sample confusion matrix and accuracy
pred_axis = pd.MultiIndex.from_product([['Predict'], knn.classes_])
true_axis = pd.MultiIndex.from_product([['True'], knn.classes_])
cfmat = confusion_matrix(y, y_pred, labels=knn.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=pred_axis, index=true_axis))
print('\nScore: ', knn.score(X, y))
#
# ### Lab 4.6.6 An Application to Caravan Insurance Data
# In[22]:
# Import Caravan data from R package ISLR
islr = importr('ISLR')
caravan_rdf = rdata(islr).fetch('Caravan')['Caravan']
# NOTE(review): same rpy2 2.x-only API as the Smarket load above -- ri2py
# was removed in rpy2 3.x.
caravan = pandas2ri.ri2py(caravan_rdf)
# In[23]:
# Peek at the data and the class balance of the response
display(caravan.head(10))
display(caravan['Purchase'].value_counts())
# In[24]:
# Standardize every predictor to zero mean / unit variance (KNN is
# scale-sensitive), then hold out the first 1000 rows as the test set
features = caravan.columns.drop('Purchase')
response = 'Purchase'
X_scaled = scale(caravan[features])
X_test, y_test = X_scaled[:1000], caravan[response][:1000]
X_train, y_train = X_scaled[1000:], caravan[response][1000:]
print("Training features shape: ", X_train.shape)
print("Test features shape: ", X_test.shape)
# In[25]:
# Fit KNN on the training set for several K values and report test-set
# performance for each
for K in (1, 3, 5):
    print("\n======================\nK = {}:".format(K))
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    # Confusion matrix, accuracy, and per-class precision/recall
    pred_axis = pd.MultiIndex.from_product([['Predict'], knn.classes_])
    true_axis = pd.MultiIndex.from_product([['True'], knn.classes_])
    cfmat = confusion_matrix(y_test, y_pred, labels=knn.classes_)
    print("\nConfusion Matrix: ")
    display(pd.DataFrame(cfmat, columns=pred_axis, index=true_axis))
    print('\nScore: ', knn.score(X_test, y_test))
    print('\nClassification Report:')
    print(classification_report(y_test, y_pred, digits=3))
# In[26]:
# Unregularized logistic regression on the scaled Caravan training data
logreg = LogisticRegression(C=1e9)  # large C => negligible regularization
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Confusion matrix and per-class precision/recall on the test set
cfmat_cnames = pd.MultiIndex.from_product([['Predict'], logreg.classes_])
cfmat_index = pd.MultiIndex.from_product([['True'], logreg.classes_])
cfmat = confusion_matrix(y_test, y_pred, labels=logreg.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nClassification Report:')
print(classification_report(y_test, y_pred, digits=3))
# In[27]:
# Generate predictions with the decision threshold for 'Yes' lowered to 0.25
# (trades overall accuracy for better recall on the rare positive class).
posterior = logreg.predict_proba(X_test)
Yes_idx = np.where(logreg.classes_ == 'Yes')[0][0]
# IMPROVED: map the boolean mask straight to labels with np.where instead of
# the previous bool Series + replace(..., inplace=True) round-trip; this is
# one step, and sidesteps any index-alignment ambiguity between the freshly
# built Series and y_test.
y_pred = np.where(posterior[:, Yes_idx] > 0.25, 'Yes', 'No')
# Evaluate accuracy (reuses the labelled axes built for the 0.5-threshold run)
cfmat = confusion_matrix(y_test, y_pred, labels=logreg.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nClassification Report:')
print(classification_report(y_test, y_pred, digits=3))