#!/usr/bin/env python
# coding: utf-8

# # Chapter 4 - Classification
# - [Lab 4.6.1 The Stock Market Data](#lab-4.6.1)
# - [Lab 4.6.2 Logistic Regression](#lab-4.6.2)
# - [Lab 4.6.3 Linear Discriminant Analysis](#lab-4.6.3)
# - [Lab 4.6.4 Quadratic Discriminant Analysis](#lab-4.6.4)
# - [Lab 4.6.5 K-Nearest Neighbors](#lab-4.6.5)
# - [Lab 4.6.6 An Application to Caravan Insurance Data](#lab-4.6.6)

# ### Imports and Configurations

# In[1]:

# Standard imports
import warnings

# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri

# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd

# StatsModels
import statsmodels.api as sm
import statsmodels.formula.api as smf

# scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix, classification_report

# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
mpl.style.use('ggplot')
import statsmodels.graphics.api as smg


# ### Lab 4.6.1 The Stock Market Data

# In[2]:

# Import Smarket dataset from R package ISLR.
# NOTE(review): pandas2ri.ri2py() is the rpy2 2.x conversion API; rpy2 >= 3.0
# replaced it with conversion contexts (rpy2.robjects.conversion) -- confirm
# the installed rpy2 version before upgrading.
islr = importr('ISLR')
smarket_rdf = rdata(islr).fetch('Smarket')['Smarket']
smarket = pandas2ri.ri2py(smarket_rdf)

# In[3]:

# Display dataset structures and statistics
display(smarket.head())
display(smarket.info())   # info() prints to stdout and returns None
display(smarket.describe())

# In[4]:

# Correlation matrix
display(smarket.corr())

# In[5]:

# Plot Smarket volumes (one bar per trading day)
ax = smarket.plot('Year', 'Volume', kind='bar', figsize=(15, 6), color='k')
# Keep only the first xtick label of each year; blank out the rest so the
# axis stays readable with 1250 bars.
xticklabels = ['',] * smarket.shape[0]
xtext, xlocs = np.unique(smarket.Year, return_index=True)
for t, i in zip(xtext, xlocs):
    xticklabels[i] = str(int(t))
ax.set_xticklabels(xticklabels, rotation=0)
ax.set_ylabel('Volume')
ax.set_title('S&P 500 volume over 1250 days.')
plt.show()


# ### Lab 4.6.2 Logistic Regression

# ##### StatsModels

# In[6]:

# Logistic regression by GLM
formula = 'Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume'
smarket_glm = smf.glm(formula, data=smarket, family=sm.families.Binomial()).fit()
print('Deviance Residuals:')
display(smarket_glm.resid_deviance.describe())
print(smarket_glm.summary())
print('\n Null deviance: {0:.1f} on {1} degrees of freedom'.format(
    smarket_glm.null_deviance, smarket_glm.df_model + smarket_glm.df_resid))
print('Residual deviance: {0:.1f} on {1} degrees of freedom'.format(
    smarket_glm.deviance, smarket_glm.df_resid))
print('AIC: {0:.2f}'.format(smarket_glm.aic))

# ##### scikit-learn LogisticRegression

# In[7]:

# Specify features and response
features = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']
response = 'Direction'

# Fit on the whole dataset
X = smarket[features]
y = smarket[response]
logreg = LogisticRegression(C=1e9)  # Use a large C to disable regularization
logreg.fit(X, y)

# Extract coefficients from fitting results
coef = pd.DataFrame(logreg.coef_, columns=features)
coef.insert(loc=0, column='(Intercept)', value=logreg.intercept_)
coef.index = ['']
display(coef)

# In[8]:

# In-sample prediction with type='response', or P(Y=1|X)
print("First ten in-sample prediction probabilities P(Y=1|X): ")
display(logreg.predict_proba(X)[0:10, 1])

# In-sample prediction with decisions
y_pred = logreg.predict(X)
print("In-sample prediction decision results: ")
display(y_pred[0:10])

# In[9]:

# Evaluate accuracy by confusion matrix and score
cfmat_cnames = pd.MultiIndex.from_product([['Predict'], logreg.classes_])
cfmat_index = pd.MultiIndex.from_product([['True'], logreg.classes_])
cfmat = confusion_matrix(y, y_pred, labels=logreg.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nScore: ', logreg.score(X, y))

# In[10]:

# Manual train-test split: hold out year 2005 as the test set
smarket_train = smarket[smarket.Year != 2005]
smarket_test = smarket[smarket.Year == 2005]
print("Training dataset shape: ", smarket_train.shape)
print("Test dataset shape: ", smarket_test.shape)

# In[11]:

# Specify features and response
features = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']
response = 'Direction'

# Fit on training data subset
X = smarket_train[features]
y = smarket_train[response]
logreg = LogisticRegression(C=1e9)  # Use a large C to disable regularization
logreg.fit(X, y)

# Prediction on test data subset
X = smarket_test[features]
y = smarket_test[response]
y_pred = logreg.predict(X)

# Evaluate accuracy
cfmat_cnames = pd.MultiIndex.from_product([['Predict'], logreg.classes_])
cfmat_index = pd.MultiIndex.from_product([['True'], logreg.classes_])
cfmat = confusion_matrix(y, y_pred, labels=logreg.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nScore: ', logreg.score(X, y))

# In[12]:

# Specify features and response
features = ['Lag1', 'Lag2']
response = 'Direction'

# Improve prediction by removing features with large p-values
X = smarket_train[features]
y = smarket_train[response]
logreg = LogisticRegression(C=1e9)  # Use a large C to disable regularization
logreg.fit(X, y)

# Prediction on test data subset
X = smarket_test[features]
y = smarket_test[response]
y_pred = logreg.predict(X)

# Evaluate accuracy
cfmat_cnames = pd.MultiIndex.from_product([['Predict'], logreg.classes_])
cfmat_index = pd.MultiIndex.from_product([['True'], logreg.classes_])
cfmat = confusion_matrix(y, y_pred, labels=logreg.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nScore: ', logreg.score(X, y))


# ### Lab 4.6.3 Linear Discriminant Analysis

# In[13]:

# Specify features and response
features = ['Lag1', 'Lag2']
response = 'Direction'

# Fit on training subset
X = smarket_train[features]
y = smarket_train[response]
lda = LinearDiscriminantAnalysis()
lda.fit(X, y)

# Priors, group means, and coefficients of linear discriminants
priors = pd.DataFrame(lda.priors_, index=lda.classes_, columns=['']).T
print("Prior probabilities of groups:")
display(priors)
gmeans = pd.DataFrame(lda.means_, index=lda.classes_, columns=features)
print("\nGroup means:")
display(gmeans)
coef = pd.DataFrame(lda.scalings_, columns=['LD1'], index=features)
print("\nCoefficients of linear discriminants:")
display(coef)

# In[14]:

# Plot linear discriminants of the LDA training fit, one histogram per class
smarket_grouped = smarket_train.groupby('Direction')
smarket_down = smarket_grouped.get_group('Down')
smarket_up = smarket_grouped.get_group('Up')
discrim_down = lda.transform(smarket_down[['Lag1', 'Lag2']])
discrim_up = lda.transform(smarket_up[['Lag1', 'Lag2']])
plt.figure(figsize=(12, 6))
plt.subplot(2, 1, 1)
# FIX: hist(normed=...) was removed in matplotlib 3.x; density=True is the
# equivalent parameter (normalizes the histogram to a probability density).
plt.hist(discrim_down, 16, density=True, color='c')
plt.title('group Down')
plt.xlim(-5, 5)
plt.subplot(2, 1, 2)
plt.hist(discrim_up, 16, density=True, color='c')
plt.title('group Up')
plt.xlim(-5, 5)
plt.show()

# In[15]:

# Prediction on test subset
X = smarket_test[['Lag1', 'Lag2']]
y = smarket_test['Direction']
y_pred = lda.predict(X)

# Generate posterior probability matrix
posterior = pd.DataFrame(lda.predict_proba(X), columns=lda.classes_)

# Generate linear discriminants on the test subset
discrim_test = lda.transform(X)

# In[16]:

# Evaluate accuracy
cfmat_cnames = pd.MultiIndex.from_product([['Predict'], lda.classes_])
cfmat_index = pd.MultiIndex.from_product([['True'], lda.classes_])
cfmat = confusion_matrix(y, y_pred, labels=lda.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nScore: ', lda.score(X, y))

# In[17]:

# First 20 posterior probabilities
print('Posterior probabilities:')
print(posterior.iloc[:20])

# First 20 prediction results
print('\nFirst 20 prediction results: \n', y_pred[:20])

# Number of Down class with threshold changed to 0.9
print('\nNumber of Down class with threshold = 0.9: ', sum(posterior['Down'] > 0.9))


# ### Lab 4.6.4 Quadratic Discriminant Analysis

# In[18]:

# Specify features and response
features = ['Lag1', 'Lag2']
response = 'Direction'

# Fit on training subset
X = smarket_train[features]
y = smarket_train[response]
qda = QuadraticDiscriminantAnalysis()
qda.fit(X, y)

# Priors, group means, and coefficients of linear discriminants
priors = pd.DataFrame(qda.priors_, index=qda.classes_, columns=['']).T
print("Prior probabilities of groups:")
display(priors)
gmeans = pd.DataFrame(qda.means_, index=qda.classes_, columns=features)
print("\nGroup means:")
display(gmeans)
# NOTE(review): sklearn QDA's scalings_ is a per-class list of scaling
# vectors, not a single LDA-style coefficient matrix -- the interpretation of
# this table as "coefficients of quadratic discriminants" should be verified.
coef = pd.DataFrame(qda.scalings_, columns=['QD1', 'QD2'], index=features)
print("\nCoefficients of quadratic discriminants:")
display(coef)

# In[19]:

# Prediction on test data subset
X = smarket_test[features]
y = smarket_test[response]
y_pred = qda.predict(X)

# Evaluate accuracy
cfmat_cnames = pd.MultiIndex.from_product([['Predict'], qda.classes_])
cfmat_index = pd.MultiIndex.from_product([['True'], qda.classes_])
cfmat = confusion_matrix(y, y_pred, labels=qda.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nScore: ', qda.score(X, y))


# ### Lab 4.6.5 K-Nearest Neighbors

# In[20]:

# Fit on training subset with K=1
features = ['Lag1', 'Lag2']
response = 'Direction'
K = 1
X = smarket_train[features]
y = smarket_train[response]
knn = KNeighborsClassifier(n_neighbors=K)
knn.fit(X, y)

# Prediction on test data subset
X = smarket_test[features]
y = smarket_test[response]
y_pred = knn.predict(X)

# Evaluate accuracy
cfmat_cnames = pd.MultiIndex.from_product([['Predict'], knn.classes_])
cfmat_index = pd.MultiIndex.from_product([['True'], knn.classes_])
cfmat = confusion_matrix(y, y_pred, labels=knn.classes_)
# In[20] (continued): report accuracy of the K=1 fit computed above.
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nScore: ', knn.score(X, y))

# In[21]:

# Fit on training subset with K=3
features = ['Lag1', 'Lag2']
response = 'Direction'
K = 3
X = smarket_train[features]
y = smarket_train[response]
knn = KNeighborsClassifier(n_neighbors=K)
knn.fit(X, y)

# Prediction on test data subset
X = smarket_test[features]
y = smarket_test[response]
y_pred = knn.predict(X)

# Evaluate accuracy
cfmat_cnames = pd.MultiIndex.from_product([['Predict'], knn.classes_])
cfmat_index = pd.MultiIndex.from_product([['True'], knn.classes_])
cfmat = confusion_matrix(y, y_pred, labels=knn.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nScore: ', knn.score(X, y))


# ### Lab 4.6.6 An Application to Caravan Insurance Data

# In[22]:

# Import Caravan data from R package ISLR
islr = importr('ISLR')
caravan_rdf = rdata(islr).fetch('Caravan')['Caravan']
caravan = pandas2ri.ri2py(caravan_rdf)

# In[23]:

display(caravan.head(10))
display(caravan['Purchase'].value_counts())

# In[24]:

# Scaling, train-test split, and building design matrices.
# Features are standardized over the WHOLE dataset first; the first 1000 rows
# become the test set and the remainder the training set.
features = caravan.columns.drop('Purchase')
response = 'Purchase'
X_scaled = scale(caravan[features])
X_train = X_scaled[1000:]
y_train = caravan[response][1000:]
X_test = X_scaled[:1000]
y_test = caravan[response][:1000]
print("Training features shape: ", X_train.shape)
print("Test features shape: ", X_test.shape)

# In[25]:

# KNN fit on training set with K=1,3,5 and predict on test set
for K in (1, 3, 5):
    print("\n======================\nK = {}:".format(K))
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Evaluate accuracy
    cfmat_cnames = pd.MultiIndex.from_product([['Predict'], knn.classes_])
    cfmat_index = pd.MultiIndex.from_product([['True'], knn.classes_])
    cfmat = confusion_matrix(y_test, y_pred, labels=knn.classes_)
    print("\nConfusion Matrix: ")
    display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
    print('\nScore: ', knn.score(X_test, y_test))
    print('\nClassification Report:')
    print(classification_report(y_test, y_pred, digits=3))

# In[26]:

# Logistic regression on training set and predict on test set
logreg = LogisticRegression(C=1e9)  # Large C to disable regularization
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Evaluate accuracy
cfmat_cnames = pd.MultiIndex.from_product([['Predict'], logreg.classes_])
cfmat_index = pd.MultiIndex.from_product([['True'], logreg.classes_])
cfmat = confusion_matrix(y_test, y_pred, labels=logreg.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nClassification Report:')
print(classification_report(y_test, y_pred, digits=3))

# In[27]:

# Generate predicts with decision threshold = 0.25
posterior = logreg.predict_proba(X_test)
Yes_idx = np.where(logreg.classes_ == 'Yes')[0][0]
# FIX: build the label vector with a single vectorized np.where instead of the
# deprecated Series.replace([True, False], ..., inplace=True) pattern; the
# resulting 'Yes'/'No' Series is identical.
y_pred = pd.Series(np.where(posterior[:, Yes_idx] > 0.25, 'Yes', 'No'))

# Evaluate accuracy (reuses the cfmat_cnames/cfmat_index built in In[26])
cfmat = confusion_matrix(y_test, y_pred, labels=logreg.classes_)
print("\nConfusion Matrix: ")
display(pd.DataFrame(cfmat, columns=cfmat_cnames, index=cfmat_index))
print('\nClassification Report:')
print(classification_report(y_test, y_pred, digits=3))