Import the modules required for this notebook. Most imports are collected here; a few section-specific imports appear in later cells.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import plotly.express as px
import pickle
from os import path
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from matplotlib.patches import Rectangle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import det_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import plot_confusion_matrix
from mlxtend.evaluate import confusion_matrix
This code segment aligns all plots at the center.
from IPython.core.display import HTML
HTML("""
<style>
.output_png {
display: table-cell;
text-align: center;
vertical-align: middle;
}
</style>
""")
# Function to visualize accuracy, f1, precision, recall obtained from grid search.
def viz_hyperparameter(res, X_axis, xlabel, scoring, xlim, ylim, plot_title):
    plt.figure(figsize=(10, 8))
    plt.title("GridSearchCV evaluating using multiple scorers simultaneously" + plot_title, fontsize=12)
    plt.xlabel(xlabel)
    plt.ylabel("Score")
    ax = plt.gca()
    ax.set_xlim(xlim[0], xlim[1])
    ax.set_ylim(ylim[0], ylim[1])
    for scorer, color in zip(sorted(scoring), ['g', 'k', 'b', 'r', 'c']):
        sample = 'test'
        style = '-'
        sample_score_mean = res['mean_%s_%s' % (sample, scorer)]
        sample_score_std = res['std_%s_%s' % (sample, scorer)]
        # Shade +/- one standard deviation around the mean test score
        ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                        sample_score_mean + sample_score_std,
                        alpha=0.1, color=color)
        ax.plot(X_axis, sample_score_mean, style, color=color,
                alpha=0.5, label="%s (%s)" % (scorer, sample))
        best_index = np.nonzero(res['rank_test_%s' % scorer] == min(res['rank_test_%s' % scorer]))[0][0]
        best_score = res['mean_test_%s' % scorer][best_index]
        # Plot a dotted vertical line at the best score for that scorer marked by x
        ax.plot([X_axis[best_index], ] * 2, [0, best_score],
                linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)
        # Annotate the best score for that scorer
        ax.annotate("%0.2f" % best_score, (X_axis[best_index], best_score + 0.005))
    plt.legend(loc="best")
    plt.grid(False)
    plt.show()
# Function to convert probabilities of the positive (high) class to binary class labels '0' or '1', based on an input threshold
def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')
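A quick sanity check of to_labels on illustrative values (these numbers are made up, not taken from the dataset):
probs = np.array([0.02, 0.40, 0.97])
print(to_labels(probs, 0.5))    # [0 0 1]
print(to_labels(probs, 0.05))   # [0 1 1] - lowering the threshold flips more records to class 1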
# Plots classification metrics (accuracy, false negative rate, false positive rate) and classification $ cost impact
# vs classification threshold.
# Inputs are y_test_ref, y_prob_high_score, threshold_interest and cost_matrix.
# threshold_interest is the probability threshold of interest for the most optimal model performance, given trade-offs
# between model $ cost and classification errors. It is a manual entry for visual display in the plot.
def plot_metrics_cost_vs_threshold(y_test_ref, y_prob_high_score, threshold_interest, cost_matrix):
    knn_fpr, knn_fnr, knn_thresholds = det_curve(y_test_ref, y_prob_high_score)
    # Confusion matrix at every candidate threshold (fixed to use y_test_ref rather than the global y_test)
    score = [confusion_matrix(y_target=y_test_ref, y_predicted=to_labels(y_prob_high_score, t)) for t in knn_thresholds]
    cost_list = []
    acc_list = []
    for nscore in score:
        fp = nscore[0][1]
        fn = nscore[1][0]
        tp = nscore[1][1]
        tn = nscore[0][0]
        total = nscore.sum()
        acc = (tp + tn) / total
        # Element-wise product of confusion matrix and cost matrix gives the total $ cost
        cost_score = np.sum(np.multiply(nscore, cost_matrix))
        cost_list.append(cost_score)
        acc_list.append(acc)
    xloc, yloc = threshold_interest, -0.1
    fig, ax1 = plt.subplots()
    color = 'k'
    ax1.set_xlabel('Threshold')
    ax1.set_ylabel('Metrics', color=color)
    ax1.plot(knn_thresholds, knn_fpr, '-.', color=color, label='fpr', linewidth=3)
    ax1.plot(knn_thresholds, knn_fnr, '--o', color=color, label='fnr', linewidth=3)
    ax1.plot(knn_thresholds, acc_list, '--o', color='g', label='acc', linewidth=3)
    # Highlight the region around the threshold of interest
    ax1.add_patch(Rectangle((xloc, yloc), 0.1, 1.1, facecolor="yellow", alpha=0.1))
    ax1.tick_params(axis='y', labelcolor=color)
    ax1.legend(loc=0)
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:red'
    ax2.set_ylabel('$ cost', color=color)  # we already handled the x-label with ax1
    ax2.plot(knn_thresholds, cost_list, 's:', color=color, label='cost')
    ax2.tick_params(axis='y', labelcolor=color)
    ax2.legend(loc=0)
    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()
    print('$ Cost and Classification Error Rates for different classification thresholds:')
    dfs = pd.DataFrame(list(zip(knn_thresholds, cost_list, knn_fnr, knn_fpr, acc_list)),
                       columns=['Threshold', '$Cost', 'FNr', 'FPr', 'Acc'])
    print(dfs)
This case study demonstrates data science skills by building and evaluating models for a real-world dataset while minimizing dollar cost. Models are developed for the given dataset and compared with respect to dollar cost using an appropriate evaluation metric. The evaluation metric is chosen after exploring the features in the data and studying the distribution of the class labels.
A business client has reached out to us with an unknown dataset containing many features and would like us to build a model that balances minimizing cost to the business against being accurate enough to apply to their day-to-day operations.
Here is how the decision cost breaks down:
True Positive - $0
True Negative - $0
False Positive - $10
False Negative - $500
Incorrectly classifying the data can get costly, as the breakdown above shows. Correctly classified data costs the client nothing. A false positive, i.e. classifying a record as positive when it is negative, costs the client \$10. A false negative, i.e. classifying a record as negative when it is positive, costs the client \$500. We will need to factor the cost of each error type into model building to minimize the total cost. For this analysis, we will compare recall, the true positive rate, for each model analyzed. Recall is the fraction of positive samples that are correctly classified as positive: if all of them are identified correctly, recall is 1; if all of them are classified incorrectly, recall is 0; with some positive samples classified as negative, recall falls between 0 and 1. The client requires a detailed cost-benefit analysis of the models we are proposing, and a comparative analysis if more than one model is proposed.
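As a minimal illustration of recall and dollar cost (toy confusion-matrix counts, purely hypothetical):
# Hypothetical counts - not results from this dataset
tn, fp, fn, tp = 90, 5, 2, 3
recall_example = tp / (tp + fn)        # 3 / (3 + 2) = 0.6
cost_example = fp * 10 + fn * 500      # 5*$10 + 2*$500 = $1,050
print(recall_example, cost_example)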
This section provides initial insights about the dataset.
The dataset was downloaded from the SMU cloud. It contains 160K observations, 50 independent variables and 1 target variable. The target variable contains only 0s and 1s, so this is a binary classification problem.
Summary is as follows:
Further analysis is conducted in next sections.
df = pd.read_csv('data/final_project.csv')
df
x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | ... | x41 | x42 | x43 | x44 | x45 | x46 | x47 | x48 | x49 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.166563 | -3.961588 | 4.621113 | 2.481908 | -1.800135 | 0.804684 | 6.718751 | -14.789997 | -1.040673 | -4.204950 | ... | -1.497117 | 5.414063 | -2.325655 | 1.674827 | -0.264332 | 60.781427 | -7.689696 | 0.151589 | -8.040166 | 0 |
1 | -0.149894 | -0.585676 | 27.839856 | 4.152333 | 6.426802 | -2.426943 | 40.477058 | -6.725709 | 0.896421 | 0.330165 | ... | 36.292790 | 4.490915 | 0.762561 | 6.526662 | 1.007927 | 15.805696 | -4.896678 | -0.320283 | 16.719974 | 0 |
2 | -0.321707 | -1.429819 | 12.251561 | 6.586874 | -5.304647 | -11.311090 | 17.812850 | 11.060572 | 5.325880 | -2.632984 | ... | -0.368491 | 9.088864 | -0.689886 | -2.731118 | 0.754200 | 30.856417 | -7.428573 | -2.090804 | -7.869421 | 0 |
3 | -0.245594 | 5.076677 | -24.149632 | 3.637307 | 6.505811 | 2.290224 | -35.111751 | -18.913592 | -0.337041 | -5.568076 | ... | 15.691546 | -7.467775 | 2.940789 | -6.424112 | 0.419776 | -72.424569 | 5.361375 | 1.806070 | -7.670847 | 0 |
4 | -0.273366 | 0.306326 | -11.352593 | 1.676758 | 2.928441 | -0.616824 | -16.505817 | 27.532281 | 1.199715 | -4.309105 | ... | -13.911297 | -5.229937 | 1.783928 | 3.957801 | -0.096988 | -14.085435 | -0.208351 | -0.894942 | 15.724742 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
159995 | -0.487024 | -4.270269 | 0.417395 | -1.992423 | 1.757552 | -1.167819 | 0.606860 | 41.084463 | -1.923188 | -2.374213 | ... | -9.390451 | 8.096802 | -0.875131 | -1.413787 | -0.363968 | 15.339392 | 4.364205 | -3.831489 | 28.389858 | 1 |
159996 | 0.825477 | 4.804368 | 22.161535 | 11.371303 | 1.715901 | 6.990759 | 32.221207 | -12.278038 | -3.861086 | 6.715126 | ... | 12.803189 | 0.841446 | -0.682177 | -5.047677 | -0.017898 | 0.780130 | 6.387266 | -1.374742 | -1.623952 | 0 |
159997 | -0.802489 | 5.362696 | 7.243419 | -7.496074 | 2.295250 | -2.756067 | 10.531388 | 42.515821 | 1.420984 | 6.788916 | ... | -0.346570 | -0.144098 | 0.738298 | 7.241041 | 0.215347 | -12.155249 | 3.265263 | 1.230963 | 3.335471 | 1 |
159998 | 0.339237 | 7.609895 | 5.368414 | -2.825481 | 4.046102 | 15.322603 | 7.805271 | -10.233054 | 2.609986 | 4.251127 | ... | -0.307656 | -0.601145 | -3.443112 | 0.549931 | 0.206728 | 5.081980 | 1.701462 | -0.279619 | -1.986424 | 0 |
159999 | -0.296748 | -0.412773 | -10.911407 | -5.633629 | -4.028154 | 15.939428 | -15.864365 | -46.388192 | 18.339472 | -4.575499 | ... | 27.837473 | 1.392395 | 0.893555 | -1.848590 | -0.423982 | -17.379380 | 5.916490 | -2.767444 | 15.547557 | 1 |
160000 rows × 51 columns
Information about the fields is not available. df.info() provides us the following information.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 160000 entries, 0 to 159999 Data columns (total 51 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 x0 159974 non-null float64 1 x1 159975 non-null float64 2 x2 159962 non-null float64 3 x3 159963 non-null float64 4 x4 159974 non-null float64 5 x5 159963 non-null float64 6 x6 159974 non-null float64 7 x7 159973 non-null float64 8 x8 159979 non-null float64 9 x9 159970 non-null float64 10 x10 159957 non-null float64 11 x11 159970 non-null float64 12 x12 159964 non-null float64 13 x13 159969 non-null float64 14 x14 159966 non-null float64 15 x15 159965 non-null float64 16 x16 159974 non-null float64 17 x17 159973 non-null float64 18 x18 159960 non-null float64 19 x19 159965 non-null float64 20 x20 159962 non-null float64 21 x21 159971 non-null float64 22 x22 159973 non-null float64 23 x23 159953 non-null float64 24 x24 159972 non-null object 25 x25 159978 non-null float64 26 x26 159964 non-null float64 27 x27 159970 non-null float64 28 x28 159965 non-null float64 29 x29 159970 non-null object 30 x30 159970 non-null object 31 x31 159961 non-null float64 32 x32 159969 non-null object 33 x33 159959 non-null float64 34 x34 159959 non-null float64 35 x35 159970 non-null float64 36 x36 159973 non-null float64 37 x37 159977 non-null object 38 x38 159969 non-null float64 39 x39 159977 non-null float64 40 x40 159964 non-null float64 41 x41 159960 non-null float64 42 x42 159974 non-null float64 43 x43 159963 non-null float64 44 x44 159960 non-null float64 45 x45 159971 non-null float64 46 x46 159969 non-null float64 47 x47 159963 non-null float64 48 x48 159968 non-null float64 49 x49 159968 non-null float64 50 y 160000 non-null int64 dtypes: float64(45), int64(1), object(5) memory usage: 62.3+ MB
Below are descriptive statistics of all features. They indicate that most features are on different scales and require normalization before they can be used for modeling.
df.describe()
x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | ... | x41 | x42 | x43 | x44 | x45 | x46 | x47 | x48 | x49 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 159974.000000 | 159975.000000 | 159962.000000 | 159963.000000 | 159974.000000 | 159963.000000 | 159974.000000 | 159973.000000 | 159979.000000 | 159970.000000 | ... | 159960.000000 | 159974.000000 | 159963.000000 | 159960.000000 | 159971.000000 | 159969.000000 | 159963.000000 | 159968.000000 | 159968.000000 | 160000.000000 |
mean | -0.001028 | 0.001358 | -1.150145 | -0.024637 | -0.000549 | 0.013582 | -1.670670 | -7.692795 | -0.030540 | 0.005462 | ... | 6.701076 | -1.833820 | -0.002091 | -0.006250 | 0.000885 | -12.755395 | 0.028622 | -0.000224 | -0.674224 | 0.401231 |
std | 0.371137 | 6.340632 | 13.273480 | 8.065032 | 6.382293 | 7.670076 | 19.298665 | 30.542264 | 8.901185 | 6.355040 | ... | 18.680196 | 5.110705 | 1.534952 | 4.164595 | 0.396621 | 36.608641 | 4.788157 | 1.935501 | 15.036738 | 0.490149 |
min | -1.592635 | -26.278302 | -59.394048 | -35.476594 | -28.467536 | -33.822988 | -86.354483 | -181.506976 | -37.691045 | -27.980659 | ... | -82.167224 | -27.933750 | -6.876234 | -17.983487 | -1.753221 | -201.826828 | -21.086333 | -8.490155 | -65.791191 | 0.000000 |
25% | -0.251641 | -4.260973 | -10.166536 | -5.454438 | -4.313118 | -5.148130 | -14.780146 | -27.324771 | -6.031058 | -4.260619 | ... | -5.804080 | -5.162869 | -1.039677 | -2.812055 | -0.266518 | -36.428329 | -3.216016 | -1.320800 | -10.931753 | 0.000000 |
50% | -0.002047 | 0.004813 | -1.340932 | -0.031408 | 0.000857 | 0.014118 | -1.948594 | -6.956789 | -0.016840 | 0.006045 | ... | 6.840110 | -1.923754 | -0.004385 | -0.010484 | 0.001645 | -12.982497 | 0.035865 | -0.011993 | -0.574410 | 0.000000 |
75% | 0.248532 | 4.284220 | 7.871676 | 5.445179 | 4.306660 | 5.190749 | 11.446931 | 12.217071 | 5.972349 | 4.305734 | ... | 19.266367 | 1.453507 | 1.033275 | 2.783274 | 0.269049 | 11.445443 | 3.268028 | 1.317703 | 9.651072 | 1.000000 |
max | 1.600849 | 27.988178 | 63.545653 | 38.906025 | 26.247812 | 35.550110 | 92.390605 | 149.150634 | 39.049831 | 27.377842 | ... | 100.050432 | 22.668041 | 6.680922 | 19.069759 | 1.669205 | 150.859415 | 20.836854 | 8.226552 | 66.877604 | 1.000000 |
8 rows × 46 columns
Here are the non-numeric features, i.e. those whose content is not purely numeric.
x24, x29 and x30 are categorical variables; x32 and x37 are numeric values stored as strings (a percentage and a dollar amount) and are converted to floats below. All other fields are numeric.
df.loc[:,['x24','x29','x30','x32','x37']]
x24 | x29 | x30 | x32 | x37 | |
---|---|---|---|---|---|
0 | euorpe | July | tuesday | 0.0% | $1313.96 |
1 | asia | Aug | wednesday | -0.02% | $1962.78 |
2 | asia | July | wednesday | -0.01% | $430.47 |
3 | asia | July | wednesday | 0.01% | $-2366.29 |
4 | asia | July | tuesday | 0.01% | $-620.66 |
... | ... | ... | ... | ... | ... |
159995 | asia | Aug | wednesday | 0.0% | $-891.96 |
159996 | asia | May | wednesday | -0.01% | $1588.65 |
159997 | asia | Jun | wednesday | -0.0% | $687.46 |
159998 | asia | May | wednesday | -0.02% | $439.21 |
159999 | asia | Aug | tuesday | 0.02% | $-1229.34 |
160000 rows × 5 columns
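A quick check of the category levels in the three categorical columns (the order of unique() output may vary):
for col in ['x24', 'x29', 'x30']:
    print(col, df[col].dropna().unique())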
# Indices of non-numeric features
indices_obj_features = [24,29,30]
# random state 1999
# Indices of numeric features
listoflist = [list(range(24)),list(range(25,29,1)),[31,32],list(range(33,38,1)),list(range(38,50,1))]
indices_num_features = [item for list_id in listoflist for item in list_id]
# x32 is a percentage string (e.g. '-0.02%'): strip the '%' and convert to float
df['x32'] = df['x32'].astype(str)
df['x32'] = df['x32'].str.replace('%', '', regex=False)
df['x32'] = df['x32'].astype(float)
# x37 is a dollar-amount string (e.g. '$1313.96'): strip the '$' and convert to float
df['x37'] = df['x37'].astype(str)
df['x37'] = df['x37'].str.replace('$', '', regex=False)
df['x37'] = df['x37'].astype(float)
# Treat the target as categorical for now
df['y'] = df['y'].astype(object)
df.loc[:,['x24','x29','x30','x32','x37']]
x24 | x29 | x30 | x32 | x37 | |
---|---|---|---|---|---|
0 | euorpe | July | tuesday | 0.00 | 1313.96 |
1 | asia | Aug | wednesday | -0.02 | 1962.78 |
2 | asia | July | wednesday | -0.01 | 430.47 |
3 | asia | July | wednesday | 0.01 | -2366.29 |
4 | asia | July | tuesday | 0.01 | -620.66 |
... | ... | ... | ... | ... | ... |
159995 | asia | Aug | wednesday | 0.00 | -891.96 |
159996 | asia | May | wednesday | -0.01 | 1588.65 |
159997 | asia | Jun | wednesday | -0.00 | 687.46 |
159998 | asia | May | wednesday | -0.02 | 439.21 |
159999 | asia | Aug | tuesday | 0.02 | -1229.34 |
160000 rows × 5 columns
Inspection of missing values begins with plotting the missing data. missingno is a Python module that lets us visually inspect missing values, as shown below. This is a quick check to see whether there are any missing values or NAs in the dataset. If the dataset contains missing values, they need to be analyzed further.
Our dataset has less than 1% missing values, and they do not cause any major change in the distribution of the data, so rows with missing values are removed in a later section of this notebook, shown step by step.
msno.matrix(df.iloc[:,:-1],figsize=(20, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x7fd0427f94a8>
pd.DataFrame((df.iloc[:,:-1].isna().sum()/len(df))*100,columns=['%Missing'])
%Missing | |
---|---|
x0 | 0.016250 |
x1 | 0.015625 |
x2 | 0.023750 |
x3 | 0.023125 |
x4 | 0.016250 |
x5 | 0.023125 |
x6 | 0.016250 |
x7 | 0.016875 |
x8 | 0.013125 |
x9 | 0.018750 |
x10 | 0.026875 |
x11 | 0.018750 |
x12 | 0.022500 |
x13 | 0.019375 |
x14 | 0.021250 |
x15 | 0.021875 |
x16 | 0.016250 |
x17 | 0.016875 |
x18 | 0.025000 |
x19 | 0.021875 |
x20 | 0.023750 |
x21 | 0.018125 |
x22 | 0.016875 |
x23 | 0.029375 |
x24 | 0.017500 |
x25 | 0.013750 |
x26 | 0.022500 |
x27 | 0.018750 |
x28 | 0.021875 |
x29 | 0.018750 |
x30 | 0.018750 |
x31 | 0.024375 |
x32 | 0.019375 |
x33 | 0.025625 |
x34 | 0.025625 |
x35 | 0.018750 |
x36 | 0.016875 |
x37 | 0.014375 |
x38 | 0.019375 |
x39 | 0.014375 |
x40 | 0.022500 |
x41 | 0.025000 |
x42 | 0.016250 |
x43 | 0.023125 |
x44 | 0.025000 |
x45 | 0.018125 |
x46 | 0.019375 |
x47 | 0.023125 |
x48 | 0.020000 |
x49 | 0.020000 |
Since the missing values are nominal in size compared to the dataset (about 1% of rows contain any missing value), rows with missing values are removed and the final dataset is obtained as below.
print(' ')
print('===========================================================')
print('Delete records with missing values')
print('===========================================================')
print('Original shape of dataframe : ',df.shape)
df_final = df.dropna().copy()
print('After dropping missing values : ',df_final.shape)
print('Total number of records dropped : ',df.shape[0]-df_final.shape[0])
print('Total percentage of records dropped : ',str(round( (df.shape[0]-df_final.shape[0])/df.shape[0]*100,2))+'%')
print(' ')
=========================================================== Delete records with missing values =========================================================== Original shape of dataframe : (160000, 51) After dropping missing values : (158392, 51) Total number of records dropped : 1608 Total percentage of records dropped : 1.0%
Below plot shows there are no missing values left in the df_final Dataframe.
msno.matrix(df_final.iloc[:,:-1],figsize=(20, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x7fcfefc3a4e0>
target = df_final['y'].value_counts(normalize=True).rename('percentage').reset_index()
plt.figure(figsize=(8,5))
sns.barplot(x="index", y="percentage", hue="index", data=target)
<matplotlib.axes._subplots.AxesSubplot at 0x7fcffd9268d0>
Below are count plots of the categorical features x24, x29 and x30, split by the target class y.
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
fig.suptitle('Region,Month,Daywise Plots')
sns.countplot(ax=axes[0],x="x24", data=df_final,hue="y")
axes[0].set_title("x24")
axes[0].tick_params(labelrotation=30)
sns.countplot(ax=axes[1],x="x29", data=df_final,hue="y")
axes[1].set_title("x29")
axes[1].tick_params(labelrotation=30)
sns.countplot(ax=axes[2],x="x30", data=df_final,hue="y")
axes[2].set_title("x30")
axes[2].tick_params(labelrotation=30)
The correlogram below shows there are not many correlated features, so we will rely on an automatic recursive feature elimination technique to reduce the number of features. This approach keeps the optimal number of features.
plt.figure(figsize=(20,8))
sns.heatmap(df_final.iloc[:,:-1].corr())
<matplotlib.axes._subplots.AxesSubplot at 0x7fcfcfd71b38>
df_final.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 158392 entries, 0 to 159999 Data columns (total 51 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 x0 158392 non-null float64 1 x1 158392 non-null float64 2 x2 158392 non-null float64 3 x3 158392 non-null float64 4 x4 158392 non-null float64 5 x5 158392 non-null float64 6 x6 158392 non-null float64 7 x7 158392 non-null float64 8 x8 158392 non-null float64 9 x9 158392 non-null float64 10 x10 158392 non-null float64 11 x11 158392 non-null float64 12 x12 158392 non-null float64 13 x13 158392 non-null float64 14 x14 158392 non-null float64 15 x15 158392 non-null float64 16 x16 158392 non-null float64 17 x17 158392 non-null float64 18 x18 158392 non-null float64 19 x19 158392 non-null float64 20 x20 158392 non-null float64 21 x21 158392 non-null float64 22 x22 158392 non-null float64 23 x23 158392 non-null float64 24 x24 158392 non-null object 25 x25 158392 non-null float64 26 x26 158392 non-null float64 27 x27 158392 non-null float64 28 x28 158392 non-null float64 29 x29 158392 non-null object 30 x30 158392 non-null object 31 x31 158392 non-null float64 32 x32 158392 non-null float64 33 x33 158392 non-null float64 34 x34 158392 non-null float64 35 x35 158392 non-null float64 36 x36 158392 non-null float64 37 x37 158392 non-null float64 38 x38 158392 non-null float64 39 x39 158392 non-null float64 40 x40 158392 non-null float64 41 x41 158392 non-null float64 42 x42 158392 non-null float64 43 x43 158392 non-null float64 44 x44 158392 non-null float64 45 x45 158392 non-null float64 46 x46 158392 non-null float64 47 x47 158392 non-null float64 48 x48 158392 non-null float64 49 x49 158392 non-null float64 50 y 158392 non-null object dtypes: float64(47), object(4) memory usage: 67.8+ MB
target_df = df_final.iloc[:,50].astype(int)
X_train, X_test, y_train, y_test = train_test_split( df_final.iloc[:,:-1],
target_df, test_size=0.2, random_state=1999)
X_train_ohe = pd.get_dummies(X_train, prefix=['x24','x29','x30'])
X_test_ohe = pd.get_dummies(X_test, prefix=['x24','x29','x30'])
X_train_ohe.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 126713 entries, 42161 to 82580 Data columns (total 67 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 x0 126713 non-null float64 1 x1 126713 non-null float64 2 x2 126713 non-null float64 3 x3 126713 non-null float64 4 x4 126713 non-null float64 5 x5 126713 non-null float64 6 x6 126713 non-null float64 7 x7 126713 non-null float64 8 x8 126713 non-null float64 9 x9 126713 non-null float64 10 x10 126713 non-null float64 11 x11 126713 non-null float64 12 x12 126713 non-null float64 13 x13 126713 non-null float64 14 x14 126713 non-null float64 15 x15 126713 non-null float64 16 x16 126713 non-null float64 17 x17 126713 non-null float64 18 x18 126713 non-null float64 19 x19 126713 non-null float64 20 x20 126713 non-null float64 21 x21 126713 non-null float64 22 x22 126713 non-null float64 23 x23 126713 non-null float64 24 x25 126713 non-null float64 25 x26 126713 non-null float64 26 x27 126713 non-null float64 27 x28 126713 non-null float64 28 x31 126713 non-null float64 29 x32 126713 non-null float64 30 x33 126713 non-null float64 31 x34 126713 non-null float64 32 x35 126713 non-null float64 33 x36 126713 non-null float64 34 x37 126713 non-null float64 35 x38 126713 non-null float64 36 x39 126713 non-null float64 37 x40 126713 non-null float64 38 x41 126713 non-null float64 39 x42 126713 non-null float64 40 x43 126713 non-null float64 41 x44 126713 non-null float64 42 x45 126713 non-null float64 43 x46 126713 non-null float64 44 x47 126713 non-null float64 45 x48 126713 non-null float64 46 x49 126713 non-null float64 47 x24_america 126713 non-null uint8 48 x24_asia 126713 non-null uint8 49 x24_euorpe 126713 non-null uint8 50 x29_Apr 126713 non-null uint8 51 x29_Aug 126713 non-null uint8 52 x29_Dev 126713 non-null uint8 53 x29_Feb 126713 non-null uint8 54 x29_January 126713 non-null uint8 55 x29_July 126713 non-null uint8 56 x29_Jun 126713 non-null uint8 57 x29_Mar 126713 non-null uint8 58 x29_May 126713 non-null uint8 59 x29_Nov 126713 non-null uint8 60 x29_Oct 126713 non-null uint8 61 x29_sept. 126713 non-null uint8 62 x30_friday 126713 non-null uint8 63 x30_monday 126713 non-null uint8 64 x30_thurday 126713 non-null uint8 65 x30_tuesday 126713 non-null uint8 66 x30_wednesday 126713 non-null uint8 dtypes: float64(47), uint8(20) memory usage: 48.8 MB
sc = StandardScaler()
XS_train_ohe = sc.fit_transform(X_train_ohe)
XS_test_ohe = sc.transform(X_test_ohe)
Store the dataframes to CSV files. These files shall be used to run different models on different machines.
#pd.DataFrame(XS_train_ohe,columns=list(X_train_ohe.columns)).to_csv(r'data/train_set.csv',index=False)
#pd.DataFrame(XS_test_ohe ,columns=list(X_test_ohe.columns )).to_csv(r'data/test_set.csv',index=False)
#pd.DataFrame(y_train,columns=['y']).to_csv(r'data/train_target.csv',index=False)
#pd.DataFrame(y_test,columns=['y']).to_csv(r'data/test_target.csv',index=False)
Since we have 67 features in total after one-hot encoding, here we check whether principal component analysis is helpful in reducing features or not. Running PCA, we found that at least 47 components are required to explain 90% of the variance in the data, which is not a significant reduction in features. So we won't use feature reduction via PCA; we will instead use the recursive feature elimination technique for logistic regression.
pca = PCA()
PCA_train = pca.fit_transform(XS_train_ohe)
PCA_test = pca.transform(XS_test_ohe)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
exp_var_cumul
px.area(
x=range(1, exp_var_cumul.shape[0] + 1),
y=exp_var_cumul,
width=800,
height=400,
labels={"x": "# Components", "y": "Explained Variance"})
Choice of metric for model development
The requirement is to minimize the cost function, which penalizes false negatives (FN) 50x more than false positives (FP).
$ cost penalty for FN = $500
$ cost penalty for FP = $10
$ cost penalty for TP, TN = 0
Effectively, the above implies that, for the same total cost, the count of FN must be 1/50th the count of FP; one needs to prioritize minimizing FN.
Given the above, recall is the metric of choice to search for the best model. Recall is defined as:
Recall = TP / (TP + FN)
An ideal model should have very low or zero FN, and hence requires a high recall score.
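As a worked example of this skew: 100 false positives cost 100 × \$10 = \$1,000, the same as only 2 false negatives (2 × \$500 = \$1,000). Every false negative avoided therefore buys room for 50 additional false positives, which is why the threshold analysis later in this notebook pushes the classification threshold low.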
Logistic Regression
Sklearn comes with a get_params function which returns the list of hyperparameters for a given estimator. All it requires is creating an object of the estimator.
The following hyperparameters can be tuned for logistic regression, with different solvers (lbfgs, sag, saga, newton-cg, liblinear) and regularization (l1 and l2):
KNN - K Nearest Neighbors
For KNN, n_neighbors is the hyperparameter that can be tuned. Different values of n_neighbors are tried, and the value that gives the best model is used for predictions.
Random Forest
Random forest is a non-parametric classifier. There are many hyperparameters that can be tuned to build a random forest, listed below.
These are important hyperparameters which can be tuned to build models. For our dataset we are using only two of them: max_features and n_estimators.
max_features is the number of features to consider when looking for the best split:
If int, then consider max_features features at each split.
If float, then max_features is a fraction and int(max_features * n_features) features are considered at each split.
If “auto”, then max_features=sqrt(n_features).
If “sqrt”, then max_features=sqrt(n_features) (same as “auto”).
If “log2”, then max_features=log2(n_features).
If None, then max_features=n_features. [scikitlearn-doc]
The following are the hyperparameters returned by the get_params function of the sklearn library.
log_model = LogisticRegression(random_state=1999,solver='lbfgs')
log_model.get_params()
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 1999, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Our dataset has 67 features after one-hot encoding, and not all of them might be important. Recursive feature elimination finds the optimal number of features and identifies which features should be kept in the model. The code below runs recursive feature elimination with cross-validation to find the optimal number of features, which is then used for building the model.
The plot below shows how the recall score varies as the number of features increases. After running full cross-validation, the optimal number of features is found to be 60.
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
min_features_to_select= 60
rfecv = RFECV(estimator=log_model, step=1, cv=StratifiedKFold(2),
scoring='recall',
min_features_to_select=min_features_to_select)
rfecv.fit(XS_train_ohe, y_train)
print("Optimal number of features : %d" % rfecv.n_features_)
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(min_features_to_select,
len(rfecv.grid_scores_) + min_features_to_select),
rfecv.grid_scores_)
plt.show()
Optimal number of features : 60
Grid search was executed for logistic regression with the parameter grid shown below.
With these variables set, grid search runs over all possible combinations of the grid parameters. The result of the grid search is stored in a file, as shown in the code below.
# Load the cached grid search result if it exists; otherwise run RFE + grid search and cache it
if(path.exists("data/logistic_final_grid_result.dat")):
    grid_result = pickle.load(open("data/logistic_final_grid_result.dat", "rb"))
else:
    log_model = LogisticRegression(random_state=1999)
    cs07_random_state_ = 1999
    #
    # Run Grid Search
    #
    log_param_grid = [
        {'penalty' : ['l2'],
         'C' : [0.1, 0.01, 0.001, 0.0001, 0.5, 0.6, 10],
         'solver' : ['sag', 'lbfgs'],
         'max_iter' : [500],
         'random_state' : [cs07_random_state_]} ]
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1999)
    print('Running RFE : ')
    rfe = RFE(log_model, n_features_to_select=61, step=1)
    # np.ravel works for both a pandas Series and an ndarray target
    rfe = rfe.fit(XS_train_ohe, np.ravel(y_train))
    print(rfe.support_)
    print(rfe.ranking_)
    print('Running grid search : ')
    grid_search = GridSearchCV(estimator=log_model, param_grid=log_param_grid, n_jobs=-1, cv=cv,
                               scoring=['roc_auc','accuracy','f1','recall','precision'], refit='f1')
    print('Fit grid search : ')
    grid_result = grid_search.fit(XS_train_ohe[:, rfe.support_], np.ravel(y_train))
    print('store logistic grid results')
    pickle.dump(grid_result, open("data/logistic_final_grid_result.dat", "wb"))
Grid search runs over all combinations of the grid parameters and exposes the best one via best_score_ and best_params_.
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
Best: 0.581015 using {'C': 0.6, 'max_iter': 500, 'penalty': 'l2', 'random_state': 1999, 'solver': 'lbfgs'}
Grid Search Result
log_score = pd.DataFrame.from_dict(grid_result.cv_results_)
log_score.head()
mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_C | param_max_iter | param_penalty | param_random_state | param_solver | params | ... | split23_test_precision | split24_test_precision | split25_test_precision | split26_test_precision | split27_test_precision | split28_test_precision | split29_test_precision | mean_test_precision | std_test_precision | rank_test_precision | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 13.432851 | 1.474184 | 0.040597 | 0.005634 | 0.1 | 500 | l2 | 1999 | sag | {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '... | ... | 0.668673 | 0.670941 | 0.657928 | 0.670738 | 0.659365 | 0.656282 | 0.665728 | 0.665584 | 0.004726 | 8 |
1 | 0.744260 | 0.051803 | 0.042062 | 0.006093 | 0.1 | 500 | l2 | 1999 | lbfgs | {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', '... | ... | 0.668673 | 0.670857 | 0.658100 | 0.670738 | 0.659365 | 0.656282 | 0.665899 | 0.665599 | 0.004716 | 7 |
2 | 10.501819 | 1.227208 | 0.043512 | 0.009269 | 0.01 | 500 | l2 | 1999 | sag | {'C': 0.01, 'max_iter': 500, 'penalty': 'l2', ... | ... | 0.669011 | 0.671036 | 0.657742 | 0.670484 | 0.660010 | 0.656353 | 0.665642 | 0.665791 | 0.004806 | 5 |
3 | 0.789555 | 0.114426 | 0.044956 | 0.009950 | 0.01 | 500 | l2 | 1999 | lbfgs | {'C': 0.01, 'max_iter': 500, 'penalty': 'l2', ... | ... | 0.669011 | 0.670951 | 0.657742 | 0.670398 | 0.659926 | 0.656353 | 0.665812 | 0.665777 | 0.004801 | 6 |
4 | 4.774041 | 1.120333 | 0.041310 | 0.010212 | 0.001 | 500 | l2 | 1999 | sag | {'C': 0.001, 'max_iter': 500, 'penalty': 'l2',... | ... | 0.671425 | 0.672234 | 0.659706 | 0.673286 | 0.663155 | 0.659015 | 0.668317 | 0.667550 | 0.005075 | 4 |
5 rows × 175 columns
grid_result
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1999), estimator=LogisticRegression(random_state=1999), n_jobs=-1, param_grid=[{'C': [0.1, 0.01, 0.001, 0.0001, 0.5, 0.6, 10], 'max_iter': [500], 'penalty': ['l2'], 'random_state': [1999], 'solver': ['sag', 'lbfgs']}], refit='f1', scoring=['roc_auc', 'accuracy', 'f1', 'recall', 'precision'])
log_sag_df = log_score[log_score['param_solver']=='lbfgs']  # note: despite the name, this keeps the lbfgs solver rows
log_sag = {}
for column in log_sag_df.columns:
log_sag[column] = log_sag_df[column].to_numpy()
scoring = {'accuracy': make_scorer(accuracy_score),'recall': make_scorer(recall_score),
'precision': make_scorer(precision_score),'f1': make_scorer(f1_score),
'roc_auc' : 'roc_auc'
}
X_sag = np.array(log_sag['param_C'], dtype=float)
viz_hyperparameter(log_sag,X_sag,xlabel="Parameter C",scoring=scoring,
xlim=[0,15],ylim=[0.35,.85],plot_title=" : Logistic lbfgs solver")
The grid search result obtained above was plotted for all evaluation metrics against hyperparameter C.
C=0.6 is the optimal value found for logistic regression. It is quite evident from the plot that precision, recall, f1 score and accuracy remain essentially constant as C increases, so there is nothing to gain by tuning beyond 0.6.
logit_model = LogisticRegression(C=0.6, max_iter = 500,
penalty = 'l2', random_state=1999,
solver='lbfgs')
rfe = RFE(logit_model,n_features_to_select=60, step=1)
rfe = rfe.fit(XS_train_ohe, y_train)
logit_model = logit_model.fit(XS_train_ohe[:,rfe.support_],y_train)
model_coef = np.argsort(logit_model.coef_[0])[::-1]
importance = logit_model.coef_[0][model_coef]
plt.figure(figsize=(10,10))
x1=sns.barplot(x=importance, y=list(X_test_ohe.columns[model_coef]))
y_hat = logit_model.predict(XS_test_ohe[:,rfe.support_])
cm = confusion_matrix(y_target=y_test, y_predicted=y_hat)
acc = accuracy_score(y_test, y_hat)
recall= recall_score(y_test, y_hat)
precision= precision_score(y_test, y_hat)
f1= f1_score(y_test, y_hat)
from mlxtend.plotting import plot_confusion_matrix
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.title('Confusion Matrix')
plt.show()
print("Accuracy : ", acc)
print("Precision : ", precision)
print("Recall : ", recall)
print("F1-Score : ", f1)
Accuracy : 0.7055778275829414 Precision : 0.6728313068355982 Recall : 0.518095987411487 F1-Score : 0.5854113881850913
cost_matrix = np.array([[0, 10],[500, 0]])
fig, ax = plot_confusion_matrix(conf_mat=cost_matrix,cmap="YlOrRd")
plt.title('Cost Matrix')
plt.show()
cost_calc = np.multiply(cm, cost_matrix)
fig, ax = plot_confusion_matrix(conf_mat=cost_calc,cmap="OrRd")
plt.title('Cost Calculation')
plt.show()
y_score = logit_model.predict_proba(XS_test_ohe[:,rfe.support_])
logit_probs = y_score[:, 1]
# compute the precision-recall curve from the predicted probabilities
logit_precision, logit_recall, logit_thresholds = precision_recall_curve(y_test, logit_probs)
logit_f1, logit_auc = f1_score(y_test, y_hat), auc(logit_recall, logit_precision)
# summarize scores
print('Logistic Regression : f1=%.3f auc=%.3f' % (logit_f1, logit_auc))
pyplot.plot(logit_recall, logit_precision, marker='.', label='Logistic Regression')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
Logistic Regression : f1=0.585 auc=0.676
plot_metrics_cost_vs_threshold(y_test, logit_probs,0.05,cost_matrix)
$ Cost and Classification Error Rates for different classification thresholds: Threshold $Cost FNr FPr Acc 0 0.006807 189690 0.000000 1.000000 0.401212 1 0.008014 190190 0.000079 1.000000 0.401181 2 0.012172 190690 0.000157 1.000000 0.401149 3 0.014811 190680 0.000157 0.999947 0.401181 4 0.016882 190670 0.000157 0.999895 0.401212 ... ... ... ... ... ... 31559 0.949043 6295510 0.990637 0.000053 0.602513 31560 0.949097 6296010 0.990716 0.000053 0.602481 31561 0.949692 6296510 0.990795 0.000053 0.602450 31562 0.950181 6297010 0.990873 0.000053 0.602418 31563 0.950191 6297000 0.990873 0.000000 0.602450 [31564 rows x 5 columns]
knn_model = KNeighborsClassifier()
knn_model.get_params()
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
if(path.exists("data/knn_final_grid_result.dat")):
knn_grid_result = pickle.load(open("data/knn_final_grid_result.dat", "rb"))
else:
knn_model = KNeighborsClassifier()
cs07_random_state_ = 1999
knn_param_grid = [
{'n_neighbors' : np.arange(10,14,2)} ]
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1999)
grid_search = GridSearchCV(estimator=knn_model,
param_grid=knn_param_grid, n_jobs=-1, cv=cv,
scoring='f1',error_score=0)
knn_grid_result = grid_search.fit(XS_train_ohe, y_train.flatten())
pickle.dump( grid_result, open( "data/knn_final_grid_result.dat", "wb" ))
# summarize results
print("Best: %f using %s" % (knn_grid_result.best_score_, knn_grid_result.best_params_))
Best: 0.694441 using {'n_neighbors': 12}
Grid Search Result
knn_grid = pd.DataFrame.from_dict(knn_grid_result.cv_results_)
knn_grid
mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_n_neighbors | params | split0_test_roc_auc | split1_test_roc_auc | split2_test_roc_auc | split3_test_roc_auc | ... | split23_test_precision | split24_test_precision | split25_test_precision | split26_test_precision | split27_test_precision | split28_test_precision | split29_test_precision | mean_test_precision | std_test_precision | rank_test_precision | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.305139 | 0.084233 | 296.612833 | 13.084617 | 10 | {'n_neighbors': 10} | 0.872573 | 0.873158 | 0.870447 | 0.865345 | ... | 0.834268 | 0.828769 | 0.821903 | 0.852197 | 0.834430 | 0.834135 | 0.844618 | 0.835537 | 0.005922 | 2 |
1 | 0.310254 | 0.188162 | 286.185080 | 55.575974 | 12 | {'n_neighbors': 12} | 0.879290 | 0.879120 | 0.878307 | 0.870748 | ... | 0.849788 | 0.837100 | 0.823304 | 0.854698 | 0.837368 | 0.842690 | 0.851820 | 0.841920 | 0.006669 | 1 |
2 rows × 171 columns
knn_neighbhor = {}
for column in knn_grid.columns:
knn_neighbhor[column] = knn_grid[column].to_numpy()
scoring = {'accuracy': make_scorer(accuracy_score),'recall': make_scorer(recall_score),
'precision': make_scorer(precision_score),'f1': make_scorer(f1_score),
'roc_auc' : 'roc_auc'
}
n_neighbhor = np.array(knn_neighbhor['param_n_neighbors'], dtype=float)
viz_hyperparameter(knn_neighbhor,n_neighbhor,xlabel="No. of Neighbors",scoring=scoring,
                   xlim=[10,15],ylim=[0.5,.95],plot_title=" : KNN ")
Grid search found n_neighbors=12 to give the better recall, so it is selected for the predictions.
knn = KNeighborsClassifier(n_neighbors=12)
model_knn = knn.fit(XS_train_ohe, y_train)
y_hat = model_knn.predict(XS_test_ohe)
cm = confusion_matrix(y_target=y_test, y_predicted=y_hat)
acc= accuracy_score(y_test, y_hat)
recall= recall_score(y_test, y_hat)
precision= precision_score(y_test, y_hat)
f1= f1_score(y_test, y_hat)
from mlxtend.plotting import plot_confusion_matrix
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.title('Confusion Matrix')
plt.show()
print("Accuracy: ", acc)
print("Precision : ", precision)
print("Recall : ", recall)
print("F1-Score : ", f1)
Accuracy: 0.7985100539789766 Precision : 0.8527149069015498 Recall : 0.6017309205350118 F1-Score : 0.7055675999815488
cost_matrix = np.array([[0, 10],[500, 0]])
fig, ax = plot_confusion_matrix(conf_mat=cost_matrix,cmap="YlOrRd")
plt.title('Cost Matrix')
plt.show()
cost_calc = np.multiply(cm, cost_matrix)
fig, ax = plot_confusion_matrix(conf_mat=cost_calc,cmap="OrRd")
plt.title('Cost Calculation')
plt.show()
y_score = model_knn.predict_proba(XS_test_ohe)
knn_probs = y_score[:, 1]
# compute the precision-recall curve from the predicted probabilities
knn_precision, knn_recall, knn_thresholds = precision_recall_curve(y_test, knn_probs)
knn_f1, knn_auc = f1_score(y_test, y_hat), auc(knn_recall, knn_precision)
# summarize scores
print('KNN: f1=%.3f auc=%.3f' % (knn_f1, knn_auc))
pyplot.plot(knn_recall, knn_precision, marker='.', label='KNN')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
KNN: f1=0.706 auc=0.849
plot_metrics_cost_vs_threshold(y_test, knn_probs,0.05,cost_matrix)
$ Cost and Classification Error Rates for different classification thresholds: Threshold $Cost FNr FPr Acc 0 0.000000 189690 0.000000 1.000000 0.401212 1 0.083333 182010 0.001967 0.893616 0.464124 2 0.166667 193070 0.008261 0.741051 0.552953 3 0.250000 283680 0.027695 0.567663 0.648979 4 0.333333 538740 0.072777 0.401919 0.730137 5 0.416667 1006540 0.150747 0.255891 0.786294 6 0.500000 1671050 0.258694 0.142601 0.810821 7 0.583333 2544210 0.398269 0.069640 0.798510 8 0.666667 3470750 0.545240 0.030313 0.763092 9 0.750000 4360730 0.685838 0.011756 0.717794 10 0.833333 5144660 0.809441 0.003479 0.673159 11 0.916667 5725190 0.900865 0.001002 0.637962 12 1.000000 6123010 0.963493 0.000053 0.613403
#results = permutation_importance(model_knn, XS_train_ohe, y_train, scoring='recall')
# get importance
#importance = results.importances_mean
# summarize feature importance
#for i,v in enumerate(importance):
# print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
#pyplot.bar([x for x in range(len(importance))], importance)
#pyplot.show()
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=1999)
rf_model.get_params()
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1999, 'verbose': 0, 'warm_start': False}
if(path.exists("data/rf_grid_result.dat")):
rf_grid_result = pickle.load(open("data/rf_grid_result.dat", "rb"))
else:
cs07_random_state_ = 1999
rf_model = RandomForestClassifier(random_state=cs07_random_state_)
#
# Run Grid Search
#
rf_param_grid = [
{'max_features' : np.arange(1,6,1),
'n_estimators' : np.arange(10,50,10),
'random_state' : [cs07_random_state_],
'n_jobs' : [-1]
} ]
rf_scoring = { 'ROC' : 'roc_auc',
'Accuracy' : make_scorer(accuracy_score),
'Recall' : make_scorer(recall_score),
'Precision' : make_scorer(precision_score),
'F1' : make_scorer(f1_score)
}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1999)
print('Running grid search : ')
grid_search = GridSearchCV(estimator=rf_model,
param_grid=rf_param_grid, n_jobs=-1, cv=cv,
scoring=['roc_auc','accuracy','f1','recall','precision'],refit='f1')
print('Fit grid search : ')
grid_result = grid_search.fit(XS_train_ohe, y_train.flatten())
pickle.dump( grid_result, open( "data/rf_grid_result.dat", "wb" ))
This is the best model returned by Grid Search hyperparameter tuning.
# summarize results
print("Best: %f using %s" % (rf_grid_result.best_score_, rf_grid_result.best_params_))
Best: 0.870298 using {'max_features': 5, 'n_estimators': 40, 'n_jobs': -1, 'random_state': 1999}
rf_grid = pd.DataFrame.from_dict(rf_grid_result.cv_results_)
rf_grid.head()
mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_features | param_n_estimators | param_n_jobs | param_random_state | params | split0_test_roc_auc | ... | split23_test_precision | split24_test_precision | split25_test_precision | split26_test_precision | split27_test_precision | split28_test_precision | split29_test_precision | mean_test_precision | std_test_precision | rank_test_precision | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.678497 | 0.269283 | 0.247049 | 0.010967 | 1 | 10 | -1 | 1999 | {'max_features': 1, 'n_estimators': 10, 'n_job... | 0.809679 | ... | 0.758632 | 0.759348 | 0.749256 | 0.769534 | 0.760996 | 0.754995 | 0.776630 | 0.764207 | 0.012129 | 20 |
1 | 9.528533 | 0.469264 | 0.259123 | 0.021618 | 1 | 20 | -1 | 1999 | {'max_features': 1, 'n_estimators': 20, 'n_job... | 0.850978 | ... | 0.818266 | 0.807516 | 0.801144 | 0.830425 | 0.809205 | 0.808978 | 0.833385 | 0.817380 | 0.009593 | 19 |
2 | 14.978017 | 0.658647 | 0.282651 | 0.034041 | 1 | 30 | -1 | 1999 | {'max_features': 1, 'n_estimators': 30, 'n_job... | 0.876086 | ... | 0.838897 | 0.834249 | 0.840244 | 0.852968 | 0.843693 | 0.843120 | 0.854378 | 0.847270 | 0.008565 | 17 |
3 | 19.508908 | 0.737629 | 0.367102 | 0.081046 | 1 | 40 | -1 | 1999 | {'max_features': 1, 'n_estimators': 40, 'n_job... | 0.886774 | ... | 0.857915 | 0.857403 | 0.857315 | 0.866707 | 0.853622 | 0.848503 | 0.875188 | 0.862306 | 0.007948 | 16 |
4 | 7.073599 | 0.840057 | 0.252776 | 0.011082 | 2 | 10 | -1 | 1999 | {'max_features': 2, 'n_estimators': 10, 'n_job... | 0.866640 | ... | 0.825772 | 0.827779 | 0.834115 | 0.842350 | 0.829114 | 0.843328 | 0.841470 | 0.835455 | 0.006917 | 18 |
5 rows × 174 columns
grid_results = rf_grid.loc[:,['param_max_features','param_n_estimators','mean_test_recall']]
grid_contour = grid_results.groupby(['param_max_features','param_n_estimators']).mean()
grid_contour
grid_reset = grid_contour.reset_index()
grid_reset.columns = ['param_max_features', 'param_n_estimators', 'mean_test_recall']
grid_pivot = grid_reset.pivot('param_max_features', 'param_n_estimators')
grid_pivot
x = grid_pivot.columns.levels[1].values
y = grid_pivot.index.values
z = grid_pivot.values
This is a 3D surface plot of mean test recall after tuning the hyperparameters max_features and n_estimators.
import plotly.graph_objects as go
# X and Y axes labels
layout = go.Layout(
xaxis=go.layout.XAxis(
title=go.layout.xaxis.Title(
text='n_estimators')
),
yaxis=go.layout.YAxis(
title=go.layout.yaxis.Title(
text='max_features')
) )
fig = go.Figure(data= [go.Surface(z=z, y=y, x=x)], layout=layout )
fig.update_layout(title='Hyperparameter tuning',
scene = dict(
xaxis_title='param_n_estimators',
yaxis_title='param_max_features',
zaxis_title='mean_test_recall'),
autosize=False,
width=600, height=500,
margin=dict(l=65, r=50, b=65, t=90))
fig.show()
rf_model = RandomForestClassifier(max_features=5,
n_estimators=40,
n_jobs=-1,
random_state=1999)
rf_model.fit(XS_train_ohe, y_train)
RandomForestClassifier(max_features=5, n_estimators=40, n_jobs=-1, random_state=1999)
importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(20,6))
plt.title("Random Forest feature importances (top 40 features) ")
plt.bar(range(XS_train_ohe.shape[1]),importances[indices],
color="#FF5733", yerr=std[indices], align="center")
plt.xticks(range(XS_train_ohe.shape[1]), X_test_ohe.columns[indices])
plt.xlim([-1, 40])
plt.show()
y_hat = rf_model.predict(XS_test_ohe)
cm = confusion_matrix(y_target=y_test, y_predicted=y_hat)
acc = accuracy_score(y_test, y_hat)
recall= recall_score(y_test, y_hat)
precision= precision_score(y_test, y_hat)
f1= f1_score(y_test, y_hat)
from mlxtend.plotting import plot_confusion_matrix
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.title('Confusion Matrix')
plt.show()
print("Accuracy : ", acc)
print("Precision : ", precision)
print("Recall : ", recall)
print("F1-Score : ", f1)
Accuracy : 0.904984374506771 Precision : 0.9209338656483249 Recall : 0.8348544453186467 F1-Score : 0.8757840871574777
cost_matrix = np.array([[0, 10],[500, 0]])
fig, ax = plot_confusion_matrix(conf_mat=cost_matrix,cmap="YlOrRd")
plt.title('Cost Matrix')
plt.show()
cost_calc = np.multiply(cm, cost_matrix)
fig, ax = plot_confusion_matrix(conf_mat=cost_calc,cmap="OrRd")
plt.title('Cost Calculation')
plt.show()
y_score = rf_model.predict_proba(XS_test_ohe)
rf_probs = y_score[:, 1]
# compute the precision-recall curve from the predicted probabilities
rf_precision, rf_recall, rf_thresholds = precision_recall_curve(y_test, rf_probs)
rf_f1, rf_auc = f1_score(y_test, y_hat), auc(rf_recall, rf_precision)
# summarize scores
print('Random Forest : f1=%.3f auc=%.3f' % (rf_f1, rf_auc))
pyplot.plot(rf_recall, rf_precision, marker='.', label='Random Forest')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
Random Forest : f1=0.876 auc=0.952
plot_metrics_cost_vs_threshold(y_test, rf_probs,0.05,cost_matrix)
$ Cost and Classification Error Rates for different classification thresholds: Threshold $Cost FNr FPr Acc 0 0.000 189690 0.000000 1.000000 0.401212 1 0.025 184740 0.000079 0.971269 0.418384 2 0.050 175670 0.000472 0.910275 0.454749 3 0.075 169030 0.001652 0.835732 0.498911 4 0.100 160690 0.002754 0.754863 0.546892 5 0.125 155350 0.004327 0.673994 0.594684 6 0.150 149060 0.005744 0.593389 0.642381 7 0.175 146450 0.007553 0.519005 0.686196 8 0.200 150310 0.010307 0.447098 0.728148 9 0.225 161550 0.013926 0.385102 0.763818 10 0.250 187280 0.019670 0.328325 0.795511 11 0.275 219960 0.026200 0.281828 0.820733 12 0.300 250800 0.032337 0.238811 0.844029 13 0.325 286300 0.039024 0.201908 0.863443 14 0.350 344930 0.049174 0.170963 0.877900 15 0.375 407540 0.059795 0.145184 0.889075 16 0.400 485790 0.072777 0.122779 0.897282 17 0.425 567590 0.086310 0.100638 0.905111 18 0.450 679880 0.104485 0.083716 0.907952 19 0.475 798350 0.123525 0.070378 0.908299 20 0.500 925390 0.143902 0.057409 0.907889 21 0.525 1058610 0.165146 0.048026 0.904984 22 0.550 1205000 0.188434 0.039538 0.900723 23 0.575 1388590 0.217545 0.032105 0.893494 24 0.600 1590880 0.249567 0.025726 0.884466 25 0.625 1795290 0.281904 0.019980 0.874933 26 0.650 2015440 0.316680 0.015499 0.863664 27 0.675 2267340 0.356412 0.012336 0.849616 28 0.700 2523820 0.396853 0.009595 0.835033 29 0.725 2832920 0.445555 0.007486 0.816756 30 0.750 3156030 0.496459 0.005430 0.797563 31 0.775 3490280 0.549095 0.004112 0.777234 32 0.800 3835030 0.603383 0.002794 0.756242 33 0.825 4221900 0.664280 0.002109 0.732220 34 0.850 4610780 0.725492 0.001476 0.708040 35 0.875 4940720 0.777419 0.001160 0.687395 36 0.900 5310120 0.835563 0.000633 0.664383 37 0.925 5660580 0.890716 0.000422 0.642381 38 0.950 5941550 0.934933 0.000264 0.624736 39 0.975 6166020 0.970260 0.000105 0.610657 40 1.000 6298510 0.991109 0.000053 0.602323
Table 7.4.1 - Performance Vs Dollar Cost
Model | Accuracy | Precision | Recall | F1 Score | Dollar Cost |
---|---|---|---|---|---|
Logistic Regression | 0.70 | 0.67 | 0.51 | 0.58 | 3,062,500 |
KNN | 0.79 | 0.85 | 0.60 | 0.70 | 2,531,000 |
Random Forest | 0.90 | 0.92 | 0.83 | 0.87 | 1,049,500 |
The random forest model provides the best performance by minimizing false negatives, and provides the best (highest) recall. That is also evident from the lowest $ cost for random forest.
Discussion on model performance
Random forest is an ensemble learning method that consists of multiple decision trees. An ensemble learning method tends to provide better metrics, as it is possible to train trees addressing a specific region of the feature space that other trees do not.
Logistic regression performs the poorest here. Logistic regression assumes a linear relationship between the log-odds of the dependent variable and the independent features. It is possible the available dataset is not linearly separable, yielding the poorest model metrics.
KNN performs between random forest and logistic regression. Given the feature set and the final 'y' variable, it is possible the dataset lends itself well to determining the output based on how close the features are.
It is possible that this data is for financial fraud detection, where y=1 implies fraud detected. If so, it is likely that the majority of frauds share a common signature in the feature set. This commonality would drive better performance for KNN, since its nearest-neighbor approach keys on feature similarity. Hence it does better than logistic regression for this problem.
Discussion on Dollar cost vs. model metric
Dollar cost and model metric tradeoffs are further analyzed by varying the binary classification threshold. Table 7.4.2 below captures the binary classification threshold that minimizes dollar cost for each model originally obtained through grid search and hyperparameter tuning, as summarized in Table 7.4.1.
Table 7.4.2 - Threshold selection
Model | Classification threshold | FNR | Accuracy | Dollar Cost |
---|---|---|---|---|
Logistic Regression | 0.006807 | 0.0 | 0.4012 | 189,690 |
KNN | 0.083333 | 0.0020 | 0.4641 | 182,010 |
Random Forest | 0.175 | 0.0076 | 0.6862 | 146,450 |
Observations
For each of the models, the \$ cost is significantly reduced by using a much lower classification threshold; compare with the default-threshold results in Table 7.4.1.
Using a very low threshold converts TN and FN into FP (false positives). Since the cost of an FP is 50x smaller than that of an FN, this lowers the actual \$ cost to the company. This also implies one need not build any sophisticated model: classifying all new records as TRUE is not much worse than the minimum \$ cost. This happens because the \$ cost of an FN is 50x the \$ cost of an FP, i.e. there is a very high skew. A quick check of this all-TRUE baseline is sketched below.
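The baseline can be computed directly: predicting every record positive gives FN = 0, so the cost is simply (number of actual negatives) × \$10, which reproduces the ~\$189,690 threshold-0 rows in the tables above:
# All-TRUE baseline: every actual negative becomes a $10 false positive, and there are no false negatives
all_true_cost = (np.ravel(y_test) == 0).sum() * 10
print('Cost of classifying every record TRUE : $', all_true_cost)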
Experiments with setting the \$ cost of FN = 500 and the \$ cost of FP = 100 (i.e. much less skew) gave a much higher accuracy and a higher optimal threshold. In such a scenario, spending time and effort on model building will likely provide a higher return on investment; a sketch of this experiment follows.
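A minimal sketch of that experiment, reusing plot_metrics_cost_vs_threshold with the less-skewed cost matrix (the threshold_interest value of 0.5 is illustrative); the KNN threshold table in the appendix appears to come from such a run:
less_skewed_cost_matrix = np.array([[0, 100],[500, 0]])   # FP raised from $10 to $100, FN unchanged
plot_metrics_cost_vs_threshold(y_test, knn_probs, 0.5, less_skewed_cost_matrix)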
model_coef = np.argsort(logit_model.coef_[0])[::-1]
importance = logit_model.coef_[0][model_coef]
plt.figure(figsize=(10,10))
x1=sns.barplot(x=importance, y=list(X_test_ohe.columns[model_coef]))
importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(20,6))
plt.title("Random Forest feature importances (top 40 features) ")
plt.bar(range(XS_train_ohe.shape[1]),importances[indices],
color="#FF5733", yerr=std[indices], align="center")
plt.xticks(range(XS_train_ohe.shape[1]), X_test_ohe.columns[indices])
plt.xlim([-1, 40])
plt.show()
Feature importance refers to a class of techniques that measure how useful input features are for predicting a target variable with a given model. The assigned score reflects the relative importance of a feature in a particular model. It can also be used to better understand a model's logic or to reduce the number of input variables. Feature importance is not defined for the KNN algorithm, but it is available for tree models.
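For KNN, a model-agnostic alternative is permutation importance, as in the commented-out cell earlier. A sketch (the subsample size and n_repeats here are illustrative choices to keep runtime manageable; this was not part of the original run):
from sklearn.inspection import permutation_importance  # already imported at the top of the notebook
# Measure the drop in recall when each feature is shuffled, on a subsample of the test set
perm = permutation_importance(model_knn, XS_test_ohe[:2000], np.ravel(y_test)[:2000],
                              scoring='recall', n_repeats=5, random_state=1999)
for i in np.argsort(perm.importances_mean)[::-1][:10]:
    print('Feature: %s, Score: %.5f' % (X_test_ohe.columns[i], perm.importances_mean[i]))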
The feature importance result for the logistic regression model prioritizes x46, x38, x35 and x37, whereas the random forest lists x23, x20, x49, x48, x42, x28, x12, x40, x37, x27, x7, x41, x38, x46, x6, x2 and x32 as the best/most important variables. Even though the ordering and magnitudes differ, both models list x46, x38 and x37 among the top important variables. That is, both models identified similar features as most important for predicting the target. This indicates that model performance is consistent and that the appropriate variables are being included in our models for prediction.
As depicted in Table 7.4.1, random forest is the best model, with a cost of \$1,049,500.
The analysis involved building a binary classification model on an unknown dataset, with the primary ask being to minimize the \$ cost.
The \$ cost penalized false negatives (FN) much more heavily than false positives (FP), at \$500 and \$10 respectively.
Logistic regression, KNN and random forest models were tuned for the best recall score and compared on \$ cost performance. Random forest does best, providing the lowest \$ cost while maintaining a high recall value.
However, a careful analysis of the choice of binary classification threshold indicated that one can minimize the \$ cost further, achieving a false negative rate of 0 for logistic regression and just 0.8% for random forest. Interestingly, the logistic regression result indicates that one doesn't need a sophisticated model to minimize \$ cost: the cost impact of FN vs FP is so skewed that a very low threshold, effectively assuming all records are TRUE, still gives near-minimal cost. Further, the random forest model does better than logistic regression, lowering the cost by another ~\$40,000.
The above brings to the fore a tradeoff between optimizing for minimal \$ cost and having a good model with high accuracy. One can have a good model, but it may not have the lowest \$ cost. On the other hand, if \$ cost reduction is the ONLY criterion, we may not need a complicated model or a detailed modeling activity. The \$ cost penalties for FN and FP have a significant bearing on the development and deployment of the right model. It is very important to have the right estimates of the costs of FN and FP, and to ask whether these costs can vary depending on some of the features in the dataset; e.g. the cost may differ by geographical location (note that the dataset does appear to have a variable encoding geographical area). All of these need to be considered before productizing this methodology.
This code was used to run the grid search for logistic regression after applying recursive feature elimination. The grid search takes time to run, so it was executed separately as a background process and the result was pickled to disk.
print('Process Begins')
X_train = pd.read_csv('data/train_set.csv',sep=',')
X_test = pd.read_csv('data/test_set.csv',sep=',')
y_train_target = pd.read_csv('data/train_target.csv',sep=',')
y_test_target = pd.read_csv('data/test_target.csv',sep=',')
XS_train_ohe = X_train.values
XS_test_ohe = X_test.values
y_train = y_train_target.values
y_test = y_test_target.values
log_model = LogisticRegression(random_state=1999)
cs07_random_state_ = 1999
log_param_grid = [
{'penalty' : ['l2'],
'C' : [0.1,0.01,0.001,0.0001,0.5,0.6,10],
'solver' : ['sag','lbfgs'],
'max_iter' : [500],
'random_state' : [cs07_random_state_]}
]
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1999)
print('Running RFE : ')
rfe = RFE(log_model,n_features_to_select=61, step=1)
rfe = rfe.fit(XS_train_ohe, y_train.flatten())
print(rfe.support_)
print(rfe.ranking_)
print('Running grid search : ')
grid_search = GridSearchCV(estimator=log_model, param_grid=log_param_grid, n_jobs=-1, cv=cv, scoring='recall')
print('Fit grid search : ')
grid_result = grid_search.fit(XS_train_ohe[:,rfe.support_], y_train.flatten())
print('store logistic grid results')
pickle.dump( grid_result, open( "data/logistic_final_grid_result.dat", "wb" ))
print('End of Process')
print('Process Begins')
X_train = pd.read_csv('data/train_set.csv',sep=',')
X_test = pd.read_csv('data/test_set.csv',sep=',')
y_train_target = pd.read_csv('data/train_target.csv',sep=',')
y_test_target = pd.read_csv('data/test_target.csv',sep=',')
XS_train_ohe = X_train.values
XS_test_ohe = X_test.values
y_train = y_train_target.values
y_test = y_test_target.values
knn_model = KNeighborsClassifier()
cs07_random_state_ = 1999
knn_param_grid = [
{'n_neighbors' : np.arange(5,40,2)} ]
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=cs07_random_state_)
print('Running grid search : ')
grid_search = GridSearchCV(estimator=knn_model, param_grid=knn_param_grid, n_jobs=-1, cv=cv, scoring=['roc_auc','accuracy','f1','recall','precision'],refit='f1')
print('Fit grid search : ')
grid_result = grid_search.fit(XS_train_ohe, y_train.flatten())
print('store knn grid results')
pickle.dump( grid_result, open( "data/knn_final_grid_result.dat", "wb" ))
print('End of Process')
Output of the KNN threshold sweep with the less-skewed cost matrix (FN = \$500, FP = \$100) discussed in the observations above:
$ Cost and Classification Error Rates for different classification thresholds:
Threshold $Cost FNr FPr Acc
0 0.000000 1896900 0.000000 1.000000 0.401212
1 0.083333 1707600 0.001967 0.893616 0.464124
2 0.166667 1458200 0.008261 0.741051 0.552953
3 0.250000 1252800 0.027695 0.567663 0.648979
4 0.333333 1224900 0.072777 0.401919 0.730137
5 0.416667 1443400 0.150747 0.255891 0.786294
6 0.500000 1914500 0.258694 0.142601 0.810821
7 0.583333 2663100 0.398269 0.069640 0.798510
8 0.666667 3522500 0.545240 0.030313 0.763092
9 0.750000 4380800 0.685838 0.011756 0.717794
10 0.833333 5150600 0.809441 0.003479 0.673159
11 0.916667 5726900 0.900865 0.001002 0.637962
12 1.000000 6123100 0.963493 0.000053 0.613403