#!/usr/bin/env python
# coding: utf-8

# # Fraud Detection
# The goal of this case study is to use various classification-based models to detect whether a transaction is a normal payment or a fraud.
#
# ## Content
# * [1. Problem Definition](#0)
# * [2. Getting Started - Load Libraries and Dataset](#1)
#     * [2.1. Load Libraries](#1.1)
#     * [2.2. Load Dataset](#1.2)
# * [3. Exploratory Data Analysis](#2)
#     * [3.1. Descriptive Statistics](#2.1)
#     * [3.2. Data Visualization](#2.2)
# * [4. Data Preparation](#3)
#     * [4.1. Data Cleaning](#3.1)
#     * [4.2. Feature Selection](#3.2)
# * [5. Evaluate Algorithms and Models](#4)
#     * [5.1. Train/Test Split](#4.1)
#     * [5.2. Evaluate Models](#4.2)
# * [6. Model Tuning](#5)
#     * [6.1. Model Tuning - Right Evaluation Metric](#5.1)
#     * [6.2. Model Tuning - Balancing the Sample](#5.2)

# # 1. Problem Definition
# In the classification framework defined for this case study, the response variable takes a value of 1 if the given transaction is a fraud and 0 otherwise.
#
# The dataset contains transactions made by credit cards in September 2013 by European cardholders. It covers transactions that occurred over two days, with 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions. The task is to forecast fraud. The feature 'Class' is the response variable and takes the value 1 in case of fraud and 0 otherwise. The remaining features are the result of a PCA transformation, so their names carry no intuitive meaning.
#
# The data can be downloaded from: https://www.kaggle.com/mlg-ulb/creditcardfraud

# # 2. Getting Started - Loading the Data and Python Packages

# ## 2.1. Loading the Python Packages

# In[1]:

# Load libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Libraries for deep learning models
# (note: keras.wrappers.scikit_learn requires an older Keras; in recent versions the
#  equivalent scikit-learn wrapper is provided by the separate scikeras package)
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD

# Libraries for saving the model
from pickle import dump
from pickle import load

# ## 2.2. Loading the Data
# We load the data in this step.
#
# #### Note: Due to the file-size limit on GitHub, only a sample of the data has been loaded into the Jupyter notebook repository of this book. However, all the subsequent results in this notebook are produced with the actual data (144MB) available at https://www.kaggle.com/mlg-ulb/creditcardfraud. You should load the full data if you want to reproduce the results.

# In[2]:

# load dataset
dataset = read_csv('creditcard_sample.csv')
#dataset = read_csv('creditcard.csv') #Load this for the actual data.
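# The sample file above is only a convenience for the repository. Purely as an illustration (this is
# not part of the original workflow, and the 20,000-row figure and file names are assumptions), a
# sample that keeps every fraud row plus a random subset of normal rows could be created from the
# full Kaggle file as follows.

# In[ ]:

# full = read_csv('creditcard.csv')
# sample = pd.concat([full[full['Class'] == 1],
#                     full[full['Class'] == 0].sample(n=20000, random_state=42)])
# sample = sample.sample(frac=1, random_state=42)   # shuffle the rows
# sample.to_csv('creditcard_sample.csv', index=False)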
# In[53]:

# Disable the warnings
import warnings
warnings.filterwarnings('ignore')

# # 3. Exploratory Data Analysis

# ## 3.1. Descriptive Statistics

# In[149]:

# shape
dataset.shape

# In[150]:

# peek at data
set_option('display.width', 100)
dataset.head(5)

# In[57]:

# types
set_option('display.max_rows', 500)
dataset.dtypes

# As shown above, all the features are floats except Class, which is an integer, and the variable names aren't intuitive.

# In[151]:

# describe data
set_option('display.precision', 3)
dataset.describe()

# Let us check the number of fraud vs. non-fraud cases in the dataset.

# In[155]:

class_names = {0: 'Not Fraud', 1: 'Fraud'}
print(dataset.Class.value_counts().rename(index=class_names))

# The dataset is unbalanced, with the vast majority of transactions being non-fraud.

# ## 3.2. Data Visualization

# In[156]:

# histograms
dataset.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1, figsize=(12, 12))
pyplot.show()

# The distributions of most of the variables are highly skewed. However, since the variable names aren't known, the plot offers little intuition.

# # 4. Data Preparation

# ## 4.1. Data Cleaning

# In[159]:

# Checking for any null values
print('Null Values =', dataset.isnull().values.any())

# There are no nulls in the data, and the data is already in float format, so there is no need to clean or categorise it.

# ## 4.2. Feature Selection

# In[160]:

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

bestfeatures = SelectKBest(k=10)  # default score function (f_classif) is used
Y = dataset["Class"]
X = dataset.loc[:, dataset.columns != 'Class']
fit = bestfeatures.fit(X, Y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
# concat the two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
print(featureScores.nlargest(10, 'Score'))  # print the 10 best features

# Although some of the features are relevant, feature selection is not given significant preference here.

# # 5. Evaluate Algorithms and Models

# ## 5.1. Train/Test Split and Evaluation Metrics

# In[200]:

# split out a validation dataset for the end
Y = dataset["Class"]
X = dataset.loc[:, dataset.columns != 'Class']
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)
scoring = 'accuracy'
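# Because fraud cases are so rare, it is worth confirming that frauds actually appear in both the
# training and the validation split before modelling. The quick check below is an illustrative
# addition and not part of the original notebook.

# In[ ]:

# fraud counts per split (illustrative sanity check)
print(Y_train.value_counts().rename(index=class_names))
print(Y_validation.value_counts().rename(index=class_names))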
# ## 5.2. Checking Models and Algorithms

# In[64]:

# test options for classification
num_folds = 10
seed = 7

# In[162]:

# spot-check some basic classification algorithms
# Given the data is huge, some of the slower classification algorithms are commented out
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
# models.append(('NB', GaussianNB()))
# models.append(('SVM', SVC()))
# # Neural network
# models.append(('NN', MLPClassifier()))
# # Ensemble models
# # Boosting methods
# models.append(('AB', AdaBoostClassifier()))
# models.append(('GBM', GradientBoostingClassifier()))
# # Bagging methods
# models.append(('RF', RandomForestClassifier()))
# models.append(('ET', ExtraTreesClassifier()))

# In[163]:

results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # shuffle so random_state takes effect
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# In[164]:

# compare algorithms
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
fig.set_size_inches(8, 4)
pyplot.show()

# The accuracy is very high because it is dominated by the overwhelming number of non-fraud cases; let's check how well the models predict the fraud cases. CART (the decision tree) is chosen from the models above.

# In[168]:

# prepare model
model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

# In[170]:

# estimate accuracy on the validation set
#rescaledValidationX = scaler.transform(X_validation)
rescaledValidationX = X_validation
predictions = model.predict(rescaledValidationX)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

# In[179]:

df_cm = pd.DataFrame(confusion_matrix(Y_validation, predictions),
                     columns=np.unique(Y_validation),
                     index=np.unique(Y_validation))
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 16})  # font size

# Although the results are good, 21 out of 100 frauds are still not caught. So we should focus on *recall*, a metric that minimises false negatives.

# # 6. Model Tuning

# ## 6.1. Model Tuning by Choosing the Correct Evaluation Metric
# Recall is selected as the evaluation metric, since it is the metric that minimises false negatives.
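# As a reminder of what recall measures, it can be computed directly from the confusion matrix of the
# CART model above: recall = TP / (TP + FN), the share of actual frauds that are caught. The short
# sketch below is an illustrative addition and not part of the original notebook.

# In[ ]:

# recall from the confusion matrix (illustrative addition)
tn, fp, fn, tp = confusion_matrix(Y_validation, predictions).ravel()
print('Recall (fraud class) = %.3f' % (tp / (tp + fn)))

# the same number computed by scikit-learn
from sklearn.metrics import recall_score
print('recall_score         = %.3f' % recall_score(Y_validation, predictions))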
# In[181]:

scoring = 'recall'

# In[182]:

# spot-check some basic classification algorithms
# Given the data is huge, some of the slower classification algorithms are commented out
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
# models.append(('NB', GaussianNB()))
# models.append(('SVM', SVC()))
# # Neural network
# models.append(('NN', MLPClassifier()))
# # Ensemble models
# # Boosting methods
# models.append(('AB', AdaBoostClassifier()))
# models.append(('GBM', GradientBoostingClassifier()))
# # Bagging methods
# models.append(('RF', RandomForestClassifier()))
# models.append(('ET', ExtraTreesClassifier()))

# In[183]:

results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Given that LDA has the best recall of all the models, it is used to evaluate the test set.

# In[184]:

# prepare model
model = LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)

# In[185]:

# estimate accuracy on the validation set
#rescaledValidationX = scaler.transform(X_validation)
rescaledValidationX = X_validation
predictions = model.predict(rescaledValidationX)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

# In[186]:

df_cm = pd.DataFrame(confusion_matrix(Y_validation, predictions),
                     columns=np.unique(Y_validation),
                     index=np.unique(Y_validation))
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 16})  # font size

# LDA performs much better, with only 18 fraud cases not caught; additionally, there are fewer false positives.
# However, there are still 18 fraud cases in the test set that aren't caught. This is addressed in the following section.

# ## 6.2. Model Tuning for Balancing the Sample by Random Under-Sampling
# In this phase of the project we implement "random under-sampling", which consists of removing data in order to obtain a more balanced dataset and thus keep our models from overfitting to the majority class.
#
# Steps:
# 1. Determine how imbalanced the class is (use "value_counts()" on the Class column to determine the count for each label).
# 2. Once we know how many instances are fraud transactions (Class = 1), bring the non-fraud transactions down to the same amount (assuming we want a 50/50 ratio); this is equivalent to 492 cases of fraud and 492 cases of non-fraud transactions.
# 3. After implementing this technique, we have a sub-sample of our dataframe with a 50/50 class ratio. The next step is to shuffle the data, to check whether our models can maintain a certain accuracy every time this script is run.
# Note: The main issue with random under-sampling is that we run the risk that our classification models will not perform as accurately as we would like, since there is a great deal of information loss (keeping only 492 of the 284,315 non-fraud transactions).

# In[204]:

Y_train.head()

# In[41]:

df = pd.concat([X_train, Y_train], axis=1)
# number of fraud rows: 492 in the full dataset
fraud_df = df.loc[df['Class'] == 1]
non_fraud_df = df.loc[df['Class'] == 0][:492]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle dataframe rows
df_new = normal_distributed_df.sample(frac=1, random_state=42)

# split the balanced sub-sample into features and response
Y_train_new = df_new["Class"]
X_train_new = df_new.loc[:, df_new.columns != 'Class']

# In[42]:

print('Distribution of the Classes in the subsample dataset')
print(df_new['Class'].value_counts() / len(df_new))

sns.countplot(x='Class', data=df_new)
pyplot.title('Equally Distributed Classes', fontsize=14)
pyplot.show()

# Now that the dataframe is correctly balanced, we can go further with our analysis and data preprocessing.
# Given that the total number of data points is now only around 900, we try all the models, including the deep learning model.
# And since the data is now balanced, the metric used here is accuracy, as it accounts for both false positives and false negatives.

# In[43]:

scoring = 'accuracy'

# In[44]:

# spot-check the algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# Neural network
models.append(('NN', MLPClassifier()))
# Ensemble models
# Boosting methods
models.append(('AB', AdaBoostClassifier()))
models.append(('GBM', GradientBoostingClassifier()))
# Bagging methods
models.append(('RF', RandomForestClassifier()))
models.append(('ET', ExtraTreesClassifier()))

# In[45]:

# Define the deep learning classifier in case the deep learning flag is set
# Set the following flag to 1 to enable the deep learning model
EnableDLModelsFlag = 1

if EnableDLModelsFlag == 1:
    # Function to create the model, required for KerasClassifier
    def create_model(neurons=12, activation='relu', learn_rate=0.01, momentum=0):
        # create model
        model = Sequential()
        model.add(Dense(X_train.shape[1], input_dim=X_train.shape[1], activation=activation))
        model.add(Dense(32, activation=activation))
        model.add(Dense(1, activation='sigmoid'))
        # Compile the model with the SGD optimizer constructed here, so that
        # learn_rate and momentum actually take effect
        optimizer = SGD(lr=learn_rate, momentum=momentum)
        model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        return model
    models.append(('DNN', KerasClassifier(build_fn=create_model, epochs=50, batch_size=10, verbose=0)))

# In[46]:

results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train_new, Y_train_new, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# In[47]:

# compare algorithms
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
fig.set_size_inches(8, 4)
pyplot.show()

# Given that GBM is the best of all the models, a grid search is performed for the GBM model by varying the number of estimators and the maximum depth.
# In[48]:

# Grid search: GradientBoosting tuning
'''
n_estimators : int (default=100)
    The number of boosting stages to perform.
    Gradient boosting is fairly robust to over-fitting, so a large number usually results in better performance.
max_depth : integer, optional (default=3)
    Maximum depth of the individual regression estimators.
    The maximum depth limits the number of nodes in the tree.
    Tune this parameter for best performance; the best value depends on the interaction of the input variables.
'''
n_estimators = [20, 180, 1000]
max_depth = [2, 3, 5]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)
model = GradientBoostingClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train_new, Y_train_new)

# Print results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
ranks = grid_result.cv_results_['rank_test_score']
for mean, stdev, param, rank in zip(means, stds, params, ranks):
    print("#%d %f (%f) with: %r" % (rank, mean, stdev, param))

# In[49]:

# prepare the final model using parameters from the grid search above
model = GradientBoostingClassifier(max_depth=5, n_estimators=1000)
model.fit(X_train_new, Y_train_new)

# In[50]:

# estimate accuracy on the original validation set
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

# In[51]:

df_cm = pd.DataFrame(confusion_matrix(Y_validation, predictions),
                     columns=np.unique(Y_validation),
                     index=np.unique(Y_validation))
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 16})  # font size

# The results on the test set are very good: the model now catches every fraud case.

# __Conclusion__:
#
# Choosing the right metric led to an improvement in the number of fraud cases detected correctly. Under-sampling led to a further significant improvement: all the fraud cases in the test set are correctly identified after under-sampling.
#
# Under-sampling came with a trade-off, though. The model trained on the under-sampled data no longer classifies non-fraud transactions as reliably and misclassifies a large number of them as fraud.
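# The pickle helpers `dump` and `load` imported at the beginning of the notebook are never used above.
# As a closing illustration (the file name is an assumption and this step is not part of the original
# notebook), the tuned model could be persisted and reloaded as follows.

# In[ ]:

# save the tuned model to disk (illustrative addition; file name is an assumption)
filename = 'fraud_gbm_model.sav'
with open(filename, 'wb') as f:
    dump(model, f)

# reload it later and score the validation set again
with open(filename, 'rb') as f:
    loaded_model = load(f)
print(accuracy_score(Y_validation, loaded_model.predict(X_validation)))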