#!/usr/bin/env python
# coding: utf-8

# In[80]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import sklearn.metrics as metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier


# ## 4.6.1 The Stock Market Data

# In[160]:

# Drop the unnamed index column and encode Direction numerically
df = pd.read_csv('smarket.csv')
smarket_dat = df.drop(df.columns[0], axis=1)
smarket_dat['Direction'] = smarket_dat['Direction'].map({'Up': 1, 'Down': 0})
smarket_dat.head()


# In[161]:

smarket_dat.describe()


# In[162]:

# Pairwise correlations; only Year and Volume are substantially correlated
_, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(smarket_dat.corr(), annot=True, ax=ax)


# In[163]:

_, ax = plt.subplots(figsize=(10, 10))
sns.regplot(ax=ax, x='Year', y='Volume', data=smarket_dat)


# In[164]:

# Volume against observation index (the data are ordered by date)
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(ax=ax, x='index', y='Volume', data=smarket_dat.reset_index())


# ## 4.6.2 Logistic Regression

# In[165]:

logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume',
                        data=smarket_dat).fit()
logit_model.summary()


# In[166]:

# After the transpose, rows are predicted direction and columns are actual
pred = logit_model.predict(smarket_dat).map(lambda x: 1 if x > 0.5 else 0)
conf_mtrx = metrics.confusion_matrix(smarket_dat['Direction'], pred)
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T


# In[109]:

# Training accuracy, read off the confusion matrix above
(145 + 507) / (145 + 141 + 457 + 507)


# In[113]:

(pred == smarket_dat['Direction']).sum() / smarket_dat.shape[0]


# In[116]:

(pred == smarket_dat['Direction']).mean()


# In[167]:

# Hold out 2005 as a test set: fit on pre-2005 data, predict on 2005
year_mask = smarket_dat['Year'] < 2005
pre = smarket_dat[year_mask]
post = smarket_dat[~year_mask]
logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume',
                        data=pre).fit()
pred = logit_model.predict(post).map(lambda x: 1 if x > 0.5 else 0)
conf_mtrx = metrics.confusion_matrix(post['Direction'], pred)
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T


# In[148]:

(pred == post['Direction']).mean()


# In[168]:

# Refit with only the two strongest predictors, Lag1 and Lag2
year_mask = smarket_dat['Year'] < 2005
pre = smarket_dat[year_mask]
post = smarket_dat[~year_mask]
logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2', data=pre).fit()
pred = logit_model.predict(post).map(lambda x: 1 if x > 0.5 else 0)
conf_mtrx = metrics.confusion_matrix(post['Direction'], pred)
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T


# In[169]:

(pred == post['Direction']).mean()


# In[170]:

# Predicted probabilities for two new observations
logit_model.predict(pd.DataFrame({'Lag1': [1.2, 1.5], 'Lag2': [1.1, -0.8]}))


# ## 4.6.3 Linear Discriminant Analysis

# In[373]:

lda = LinearDiscriminantAnalysis()
lda.fit(pre[['Lag1', 'Lag2']], pre['Direction'])

# Group means
pd.DataFrame(lda.means_, columns=['Lag1', 'Lag2'], index=['Down', 'Up'])


# In[207]:

# Priors
pd.DataFrame(lda.priors_, index=['Down', 'Up'], columns=['Prior'])


# In[217]:

# Coefficients - NB: sklearn calls these `scalings`
pd.DataFrame(lda.scalings_, columns=['LD1'], index=['Lag1', 'Lag2'])


# In[374]:

# Project the training data onto the single linear discriminant.
# NB: sklearn's own `transform` centres X first, so this plot differs
# from it only by a constant shift along LD1.
LD1 = pre[['Lag1', 'Lag2']] @ lda.scalings_
LD1.columns = ['LD1']
LD1['Direction'] = pre['Direction']
LD1['c'] = 0
_, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x='LD1', y='c', data=LD1, hue='Direction', ax=ax)


# In[380]:

# Prediction accuracy and confusion matrix (rows: predicted, columns: actual)
preds = lda.predict(post[['Lag1', 'Lag2']])
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
print((preds == post['Direction']).mean())
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])


# In[382]:

# Get class probabilities
lda.predict_proba(post[['Lag1', 'Lag2']])
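# In[ ]:

# Sketch (added; not in the original lab): recreating the ISLR posterior
# threshold exercise. Column order in `predict_proba` follows `lda.classes_`,
# so we check it before treating column 0 as P(Down).
posteriors = lda.predict_proba(post[['Lag1', 'Lag2']])
print(lda.classes_)                      # expect [0 1], i.e. Down, Up
print((posteriors[:, 0] >= 0.5).sum())   # days with P(Down) >= 0.5
print((posteriors[:, 0] >= 0.9).sum())   # days with P(Down) >= 0.9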
# In[370]:

# BONUS: Simulation in 1D with good separation
class_1 = pd.DataFrame({'class': 0,
                        'x1': np.random.normal(1, 2, 100),
                        'x2': np.random.normal(2, 2, 100),
                        'x3': np.random.normal(3, 2, 100)})
class_2 = pd.DataFrame({'class': 1,
                        'x1': np.random.normal(5, 2, 100),
                        'x2': np.random.normal(10, 2, 100),
                        'x3': np.random.normal(15, 2, 100)})
df = pd.concat([class_1, class_2]).sample(frac=1)
lda = LinearDiscriminantAnalysis()
lda.fit(df[['x1', 'x2', 'x3']], df['class'])
# Two classes yield a single discriminant: min(n_classes - 1, n_features) = 1
LDs = df[['x1', 'x2', 'x3']] @ lda.scalings_
LDs.columns = ['LD1']
LDs['c'] = 0
LDs['class'] = df['class']
_, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x='LD1', y='c', data=LDs, hue='class', ax=ax)


# In[371]:

# BONUS: Simulation in 2D with good separation
class_1 = pd.DataFrame({'class': 0,
                        'x1': np.random.normal(1, 2, 100),
                        'x2': np.random.normal(2, 2, 100),
                        'x3': np.random.normal(3, 2, 100)})
class_2 = pd.DataFrame({'class': 1,
                        'x1': np.random.normal(5, 2, 100),
                        'x2': np.random.normal(10, 2, 100),
                        'x3': np.random.normal(15, 2, 100)})
class_3 = pd.DataFrame({'class': 2,
                        'x1': np.random.normal(10, 2, 100),
                        'x2': np.random.normal(20, 2, 100),
                        'x3': np.random.normal(30, 2, 100)})
df = pd.concat([class_1, class_2, class_3]).sample(frac=1)
lda = LinearDiscriminantAnalysis()
lda.fit(df[['x1', 'x2', 'x3']], df['class'])
# Three classes yield two discriminants, so we can plot LD1 against LD2
LDs = df[['x1', 'x2', 'x3']] @ lda.scalings_
LDs.columns = ['LD1', 'LD2']
LDs['class'] = df['class']
_, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x='LD1', y='LD2', data=LDs, hue='class', ax=ax)


# ## 4.6.4 Quadratic Discriminant Analysis

# In[386]:

qda = QuadraticDiscriminantAnalysis()
qda.fit(pre[['Lag1', 'Lag2']], pre['Direction'])

# Group means (note: `qda`, not the earlier `lda` fit)
pd.DataFrame(qda.means_, columns=['Lag1', 'Lag2'], index=['Down', 'Up'])


# In[390]:

preds = qda.predict(post[['Lag1', 'Lag2']])
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
print((preds == post['Direction']).mean())
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])


# ## 4.6.5 K-Nearest Neighbours

# In[401]:

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
preds = knn.predict(post[['Lag1', 'Lag2']])
print((preds == post['Direction']).mean())
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])


# In[403]:

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
preds = knn.predict(post[['Lag1', 'Lag2']])
print((preds == post['Direction']).mean())
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])


# ## 4.6.6 An Application to Caravan Insurance Data

# In[10]:

caravan_dat = pd.read_csv('caravan.csv')
caravan_dat = caravan_dat.drop(caravan_dat.columns[0], axis=1)
caravan_dat.head()


# In[17]:

caravan_dat.describe()


# In[18]:

# Standardise the predictors so KNN distances are not dominated by
# variables with large scales
tmp = caravan_dat.drop('Purchase', axis=1)
caravan_dat_std = (tmp - tmp.mean()) / tmp.std()
caravan_dat_std['Purchase'] = caravan_dat['Purchase'].map({'Yes': 1, 'No': 0})
caravan_dat_std.head()


# In[38]:

# First 1000 observations as test set, remainder as training set
test = caravan_dat_std.iloc[0:1000]
train = caravan_dat_std.iloc[1000:]
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train.drop('Purchase', axis=1), train['Purchase'])
preds = knn.predict(test.drop('Purchase', axis=1))

# KNN prediction accuracy with k = 1
(preds == test['Purchase']).mean()


# In[40]:

conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
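# In[ ]:

# Sketch (added; not in the original lab): sweep a few values of k and
# report test accuracy and positive predictive value in one pass, rather
# than refitting by hand as in the cells below.
for k in (1, 3, 5):
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(train.drop('Purchase', axis=1), train['Purchase'])
    preds_k = knn_k.predict(test.drop('Purchase', axis=1))
    acc = (preds_k == test['Purchase']).mean()
    ppv = metrics.precision_score(test['Purchase'], preds_k)
    print(f'k={k}: accuracy={acc:.3f}, PPV={ppv:.3f}')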
# In[43]:

# Naive prediction accuracy from always predicting 'No'
(test['Purchase'] == 0).mean()


# In[44]:

# KNN positive predictive value with k = 1
9 / (68 + 9)


# In[51]:

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train.drop('Purchase', axis=1), train['Purchase'])
preds = knn.predict(test.drop('Purchase', axis=1))
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])


# In[53]:

# KNN positive predictive value with k = 5
4 / (11 + 4)


# In[77]:

# Logistic regression on all predictors
formula = 'Purchase ~ ' + ' + '.join(train.drop('Purchase', axis=1).columns)
logit_model = smf.logit(formula=formula, data=train).fit()
preds = (logit_model.predict(test) > 0.5)
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
# With a 0.5 cutoff the positive predictive value is zero!


# In[79]:

# Lowering the cutoff to 0.25 recovers some true positives
preds = (logit_model.predict(test) > 0.25)
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
# Positive predictive value is 11 / 33 = 1/3
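# In[ ]:

# Sketch (added; not in the original lab): trace how the positive predictive
# value moves with the cutoff, generalising the two thresholds tried above.
# `zero_division=0` assumes scikit-learn >= 0.22.
probs = logit_model.predict(test)
for cutoff in (0.5, 0.25, 0.15, 0.1):
    preds_c = (probs > cutoff).astype(int)
    ppv = metrics.precision_score(test['Purchase'], preds_c, zero_division=0)
    print(f'cutoff={cutoff}: predicted Yes for {preds_c.sum()} customers, PPV={ppv:.3f}')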