#!/usr/bin/env python
# coding: utf-8

# In[80]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import sklearn.metrics as metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier


# ## 4.6.1 The Stock Market Data

# In[160]:

# Drop the unnamed index column and encode Direction numerically
df = pd.read_csv('smarket.csv')
smarket_dat = df.drop(df.columns[0], axis=1)
smarket_dat['Direction'] = smarket_dat['Direction'].map({'Up': 1, 'Down': 0})
smarket_dat.head()


# In[161]:

smarket_dat.describe()


# In[162]:

# Pairwise correlations; only Year and Volume are substantially correlated
_, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(smarket_dat.corr(), annot=True, ax=ax)


# In[163]:

_, ax = plt.subplots(figsize=(10, 10))
sns.regplot(ax=ax, x='Year', y='Volume', data=smarket_dat)


# In[164]:

# Volume against observation index (the data are ordered by date)
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(ax=ax, x='index', y='Volume', data=smarket_dat.reset_index())


# ## 4.6.2 Logistic Regression

# In[165]:

logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume',
                        data=smarket_dat).fit()
logit_model.summary()


# In[166]:

# After the transpose, rows are predicted direction and columns are actual
pred = logit_model.predict(smarket_dat).map(lambda x: 1 if x > 0.5 else 0)
conf_mtrx = metrics.confusion_matrix(smarket_dat['Direction'], pred)
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T


# In[109]:

# Training accuracy, read off the confusion matrix above
(145 + 507) / (145 + 141 + 457 + 507)


# In[113]:

(pred == smarket_dat['Direction']).sum() / smarket_dat.shape[0]


# In[116]:

(pred == smarket_dat['Direction']).mean()


# In[167]:

# Hold out 2005 as a test set: fit on pre-2005 data, predict on 2005
year_mask = smarket_dat['Year'] < 2005
pre = smarket_dat[year_mask]
post = smarket_dat[~year_mask]
logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume',
                        data=pre).fit()
pred = logit_model.predict(post).map(lambda x: 1 if x > 0.5 else 0)
conf_mtrx = metrics.confusion_matrix(post['Direction'], pred)
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T


# In[148]:

(pred == post['Direction']).mean()


# In[168]:

# Refit with only the two strongest predictors, Lag1 and Lag2
year_mask = smarket_dat['Year'] < 2005
pre = smarket_dat[year_mask]
post = smarket_dat[~year_mask]
logit_model = smf.logit(formula='Direction ~ Lag1 + Lag2', data=pre).fit()
pred = logit_model.predict(post).map(lambda x: 1 if x > 0.5 else 0)
conf_mtrx = metrics.confusion_matrix(post['Direction'], pred)
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up']).T


# In[169]:

(pred == post['Direction']).mean()


# In[170]:

# Predicted probabilities for two new observations
logit_model.predict(pd.DataFrame({'Lag1': [1.2, 1.5], 'Lag2': [1.1, -0.8]}))


# ## 4.6.3 Linear Discriminant Analysis

# In[373]:

lda = LinearDiscriminantAnalysis()
lda.fit(pre[['Lag1', 'Lag2']], pre['Direction'])

# Group means
pd.DataFrame(lda.means_, columns=['Lag1', 'Lag2'], index=['Down', 'Up'])


# In[207]:

# Priors
pd.DataFrame(lda.priors_, index=['Down', 'Up'], columns=['Prior'])


# In[217]:

# Coefficients - NB: sklearn calls these `scalings`
pd.DataFrame(lda.scalings_, columns=['LD1'], index=['Lag1', 'Lag2'])


# In[374]:

# Project the training data onto the single linear discriminant.
# NB: sklearn's own `transform` centres X first, so this plot differs
# from it only by a constant shift along LD1.
LD1 = pre[['Lag1', 'Lag2']] @ lda.scalings_
LD1.columns = ['LD1']
LD1['Direction'] = pre['Direction']
LD1['c'] = 0
_, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x='LD1', y='c', data=LD1, hue='Direction', ax=ax)


# In[380]:

# Prediction accuracy and confusion matrix (rows: predicted, columns: actual)
preds = lda.predict(post[['Lag1', 'Lag2']])
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
print((preds == post['Direction']).mean())
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])


# In[382]:

# Get class probabilities
lda.predict_proba(post[['Lag1', 'Lag2']])
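# In[ ]:

# Sketch (added; not in the original lab): recreating the ISLR posterior
# threshold exercise. Column order in `predict_proba` follows `lda.classes_`,
# so we check it before treating column 0 as P(Down).
posteriors = lda.predict_proba(post[['Lag1', 'Lag2']])
print(lda.classes_)                      # expect [0 1], i.e. Down, Up
print((posteriors[:, 0] >= 0.5).sum())   # days with P(Down) >= 0.5
print((posteriors[:, 0] >= 0.9).sum())   # days with P(Down) >= 0.9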
# In[370]:

# BONUS: Simulation in 1D with good separation
class_1 = pd.DataFrame({'class': 0,
                        'x1': np.random.normal(1, 2, 100),
                        'x2': np.random.normal(2, 2, 100),
                        'x3': np.random.normal(3, 2, 100)})
class_2 = pd.DataFrame({'class': 1,
                        'x1': np.random.normal(5, 2, 100),
                        'x2': np.random.normal(10, 2, 100),
                        'x3': np.random.normal(15, 2, 100)})
df = pd.concat([class_1, class_2]).sample(frac=1)
lda = LinearDiscriminantAnalysis()
lda.fit(df[['x1', 'x2', 'x3']], df['class'])
# Two classes yield a single discriminant: min(n_classes - 1, n_features) = 1
LDs = df[['x1', 'x2', 'x3']] @ lda.scalings_
LDs.columns = ['LD1']
LDs['c'] = 0
LDs['class'] = df['class']
_, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x='LD1', y='c', data=LDs, hue='class', ax=ax)


# In[371]:

# BONUS: Simulation in 2D with good separation
class_1 = pd.DataFrame({'class': 0,
                        'x1': np.random.normal(1, 2, 100),
                        'x2': np.random.normal(2, 2, 100),
                        'x3': np.random.normal(3, 2, 100)})
class_2 = pd.DataFrame({'class': 1,
                        'x1': np.random.normal(5, 2, 100),
                        'x2': np.random.normal(10, 2, 100),
                        'x3': np.random.normal(15, 2, 100)})
class_3 = pd.DataFrame({'class': 2,
                        'x1': np.random.normal(10, 2, 100),
                        'x2': np.random.normal(20, 2, 100),
                        'x3': np.random.normal(30, 2, 100)})
df = pd.concat([class_1, class_2, class_3]).sample(frac=1)
lda = LinearDiscriminantAnalysis()
lda.fit(df[['x1', 'x2', 'x3']], df['class'])
# Three classes yield two discriminants, so we can plot LD1 against LD2
LDs = df[['x1', 'x2', 'x3']] @ lda.scalings_
LDs.columns = ['LD1', 'LD2']
LDs['class'] = df['class']
_, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x='LD1', y='LD2', data=LDs, hue='class', ax=ax)


# ## 4.6.4 Quadratic Discriminant Analysis

# In[386]:

qda = QuadraticDiscriminantAnalysis()
qda.fit(pre[['Lag1', 'Lag2']], pre['Direction'])

# Group means (note: `qda`, not the earlier `lda` fit)
pd.DataFrame(qda.means_, columns=['Lag1', 'Lag2'], index=['Down', 'Up'])


# In[390]:

preds = qda.predict(post[['Lag1', 'Lag2']])
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
print((preds == post['Direction']).mean())
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])


# ## 4.6.5 K-Nearest Neighbours

# In[401]:

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
preds = knn.predict(post[['Lag1', 'Lag2']])
print((preds == post['Direction']).mean())
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])


# In[403]:

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(pre[['Lag1', 'Lag2']], pre['Direction'])
preds = knn.predict(post[['Lag1', 'Lag2']])
print((preds == post['Direction']).mean())
conf_mtrx = metrics.confusion_matrix(preds, post['Direction'])
pd.DataFrame(conf_mtrx, columns=['Down', 'Up'], index=['Down', 'Up'])


# ## 4.6.6 An Application to Caravan Insurance Data

# In[10]:

caravan_dat = pd.read_csv('caravan.csv')
caravan_dat = caravan_dat.drop(caravan_dat.columns[0], axis=1)
caravan_dat.head()


# In[17]:

caravan_dat.describe()


# In[18]:

# Standardise the predictors so KNN distances are not dominated by
# variables with large scales
tmp = caravan_dat.drop('Purchase', axis=1)
caravan_dat_std = (tmp - tmp.mean()) / tmp.std()
caravan_dat_std['Purchase'] = caravan_dat['Purchase'].map({'Yes': 1, 'No': 0})
caravan_dat_std.head()


# In[38]:

# First 1000 observations as test set, remainder as training set
test = caravan_dat_std.iloc[0:1000]
train = caravan_dat_std.iloc[1000:]
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(train.drop('Purchase', axis=1), train['Purchase'])
preds = knn.predict(test.drop('Purchase', axis=1))

# KNN prediction accuracy with k = 1
(preds == test['Purchase']).mean()


# In[40]:

conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
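# In[ ]:

# Sketch (added; not in the original lab): sweep a few values of k and
# report test accuracy and positive predictive value in one pass, rather
# than refitting by hand as in the cells below.
for k in (1, 3, 5):
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(train.drop('Purchase', axis=1), train['Purchase'])
    preds_k = knn_k.predict(test.drop('Purchase', axis=1))
    acc = (preds_k == test['Purchase']).mean()
    ppv = metrics.precision_score(test['Purchase'], preds_k)
    print(f'k={k}: accuracy={acc:.3f}, PPV={ppv:.3f}')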
# In[43]:

# Naive prediction accuracy from always predicting 'No'
(test['Purchase'] == 0).mean()


# In[44]:

# KNN positive predictive value with k = 1
9 / (68 + 9)


# In[51]:

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train.drop('Purchase', axis=1), train['Purchase'])
preds = knn.predict(test.drop('Purchase', axis=1))
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])


# In[53]:

# KNN positive predictive value with k = 5
4 / (11 + 4)


# In[77]:

# Logistic regression on all predictors
formula = 'Purchase ~ ' + ' + '.join(train.drop('Purchase', axis=1).columns)
logit_model = smf.logit(formula=formula, data=train).fit()
preds = (logit_model.predict(test) > 0.5)
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
# With a 0.5 cutoff the positive predictive value is zero!


# In[79]:

# Lowering the cutoff to 0.25 recovers some true positives
preds = (logit_model.predict(test) > 0.25)
conf_mtrx = metrics.confusion_matrix(preds, test['Purchase'])
pd.DataFrame(conf_mtrx, columns=['No', 'Yes'], index=['No', 'Yes'])
# Positive predictive value is 11 / 33 = 1/3
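# In[ ]:

# Sketch (added; not in the original lab): trace how the positive predictive
# value moves with the cutoff, generalising the two thresholds tried above.
# `zero_division=0` assumes scikit-learn >= 0.22.
probs = logit_model.predict(test)
for cutoff in (0.5, 0.25, 0.15, 0.1):
    preds_c = (probs > cutoff).astype(int)
    ppv = metrics.precision_score(test['Purchase'], preds_c, zero_division=0)
    print(f'cutoff={cutoff}: predicted Yes for {preds_c.sum()} customers, PPV={ppv:.3f}')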