Answers are at the end of the notebook
# !pip install xgboost
# !pip install -U imbalanced-learn
# !pip install --upgrade scipy
# !pip install threadpoolctl==3.1.0
# !pip install category_encoders
# Data handling
import pandas as pd
import numpy as np
import threadpoolctl
# Vizualisation (Matplotlib, Plotly, Seaborn, etc. )
import seaborn as sns
import matplotlib.pyplot as plt
# EDA (pandas-profiling, etc. )
# Statistics
from scipy import stats
# Feature Processing (Scikit-learn processing, etc. )
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from collections import Counter
# balance data
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# Machine Learning (Scikit-learn Estimators, Catboost, LightGBM, etc. )
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, fbeta_score, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
# Hyperparameters Fine-tuning (Scikit-learn hp search, cross-validation, etc. )
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Evaluations
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, RocCurveDisplay, roc_curve, auc
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from statistics import stdev
from sklearn.model_selection import KFold
# Other packages
import os, pickle
import warnings
warnings.filterwarnings("ignore")
# Mounting Google Drive
from google.colab import drive # Import for accessing Google Drive
# Unzipping files
import zipfile # Import for extracting zip files
# Mount your Google Drive
drive.mount('/content/drive')
# Get the file path from Google Drive
file_path = '/content/drive/MyDrive/Colab Notebooks/datasets/Fraud.zip'
# Unzip the file
with zipfile.ZipFile(file_path, 'r') as zip_ref:
# Find the CSV files in the zip folder
file_path = zip_ref.extract('Fraud.csv', '/content/')
# Read the csv file from the url
data = pd.read_csv(file_path)
Mounted at /content/drive
# !pip install ydata-profiling
# from ydata_profiling import ProfileReport
# profile = ProfileReport(data, title="Pandas Profiling Report")
# profile.to_notebook_iframe()
H0: The sample has a Gaussian distribution in the numerical feautures.
H1: The sample does not have a Gaussian distribution in the numerical feautures.
Rename the columns for better readability.
data.rename(columns={
'step': 'time_step',
'type': 'trans_type',
'amount': 'trans_amt',
'nameOrig': 'cust_orig',
'oldbalanceOrg': 'old_orig_bal',
'newbalanceOrig': 'new_orig_bal',
'nameDest': 'cust_dest',
'oldbalanceDest': 'old_dest_bal',
'newbalanceDest': 'new_dest_bal'
}, inplace=True)
data.head()
time_step | trans_type | trans_amt | cust_orig | old_orig_bal | new_orig_bal | cust_dest | old_dest_bal | new_dest_bal | isFraud | isFlaggedFraud | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | PAYMENT | 9839.64 | C1231006815 | 170136.0 | 160296.36 | M1979787155 | 0.0 | 0.0 | 0 | 0 |
1 | 1 | PAYMENT | 1864.28 | C1666544295 | 21249.0 | 19384.72 | M2044282225 | 0.0 | 0.0 | 0 | 0 |
2 | 1 | TRANSFER | 181.00 | C1305486145 | 181.0 | 0.00 | C553264065 | 0.0 | 0.0 | 1 | 0 |
3 | 1 | CASH_OUT | 181.00 | C840083671 | 181.0 | 0.00 | C38997010 | 21182.0 | 0.0 | 1 | 0 |
4 | 1 | PAYMENT | 11668.14 | C2048537720 | 41554.0 | 29885.86 | M1230701703 | 0.0 | 0.0 | 0 | 0 |
create new column to indicate merchants
data['isMerchant'] = data['cust_dest'].apply(lambda x: 1 if x.startswith('M') else 0)
data.shape
(6362620, 12)
# summary of the dataframe
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6362620 entries, 0 to 6362619 Data columns (total 12 columns): # Column Dtype --- ------ ----- 0 time_step int64 1 trans_type object 2 trans_amt float64 3 cust_orig object 4 old_orig_bal float64 5 new_orig_bal float64 6 cust_dest object 7 old_dest_bal float64 8 new_dest_bal float64 9 isFraud int64 10 isFlaggedFraud int64 11 isMerchant int64 dtypes: float64(5), int64(4), object(3) memory usage: 582.5+ MB
for col in data.columns:
if data[col].dtype == 'object':
print(col, data[col].nunique())
trans_type 5 cust_orig 6353307 cust_dest 2722362
cust_orig and cust_dest are unique for each transaction, so we can drop them.
data.drop(columns=['cust_orig', 'cust_dest'], inplace=True)
data.describe()
time_step | trans_amt | old_orig_bal | new_orig_bal | old_dest_bal | new_dest_bal | isFraud | isFlaggedFraud | isMerchant | |
---|---|---|---|---|---|---|---|---|---|
count | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 | 6.362620e+06 |
mean | 2.433972e+02 | 1.798619e+05 | 8.338831e+05 | 8.551137e+05 | 1.100702e+06 | 1.224996e+06 | 1.290820e-03 | 2.514687e-06 | 3.381461e-01 |
std | 1.423320e+02 | 6.038582e+05 | 2.888243e+06 | 2.924049e+06 | 3.399180e+06 | 3.674129e+06 | 3.590480e-02 | 1.585775e-03 | 4.730786e-01 |
min | 1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
25% | 1.560000e+02 | 1.338957e+04 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
50% | 2.390000e+02 | 7.487194e+04 | 1.420800e+04 | 0.000000e+00 | 1.327057e+05 | 2.146614e+05 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
75% | 3.350000e+02 | 2.087215e+05 | 1.073152e+05 | 1.442584e+05 | 9.430367e+05 | 1.111909e+06 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
max | 7.430000e+02 | 9.244552e+07 | 5.958504e+07 | 4.958504e+07 | 3.560159e+08 | 3.561793e+08 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 |
check for outliers only in the columns that shouldn't have outliers.
# Check of outliers by applying the IQR method checking
df = data.drop(['trans_type', 'time_step', 'isFraud', 'isFlaggedFraud', 'isMerchant'], axis=1)
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3-Q1
IQR
((df < (Q1-1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any()
trans_amt True old_orig_bal True new_orig_bal True old_dest_bal True new_dest_bal True dtype: bool
The results above showed there are outliers, but these outliers are probably because of fraudulent transactions.
plt.figure(figsize=(10, 6))
# Plot the boxplot
df.boxplot()
# Rotate x-axis labels by 45 degrees
plt.xticks(rotation=45)
# Display the plot
plt.show()
Outlier Removal Tradeoff: We have to be careful as to how far do we want the threshold for removing outliers. We determine the threshold by multiplying a number (ex: 1.5) by the (Interquartile Range). The higher this threshold is, the less outliers will detect
The Tradeoff: The lower the threshold the more outliers it will remove however, we want to focus more on "extreme outliers" rather than just outliers. Why? because we might run the risk of information loss which will cause our models to have a lower performance
# Check if outliers still exist in the columns
outliers_exist = False
for col in df.columns.tolist():
# Calculate the first and third quartiles (Q1 and Q3)
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
# Calculate the interquartile range (IQR)
IQR = Q3 - Q1
# Define the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Modify the values in the col to be within the range
df[col] = df[col].clip(lower_bound, upper_bound)
# Check if outliers exist in the col
if (df[col] < lower_bound).any() or (df[col] > upper_bound).any():
outliers_exist = True
print(f"Outliers still exist in '{col}'.")
# Apply logarithmic transformation to 'trans_amt' to handle skewness
df = data
df['log_trans_amt'] = np.log1p(df['trans_amt']) # Using log1p to avoid log(0)
# Create a plot with the transformed data
plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='log_trans_amt', hue='isFraud', multiple='stack', alpha=0.5)
plt.xlabel('Log of Transaction Amount')
plt.ylabel('Count')
plt.title('Distribution of Transaction Amounts by Fraud Status')
plt.show()
data.drop('log_trans_amt', axis=1, inplace=True)
fig = plt.figure(figsize= (10,8))
sns.countplot(data = data,
x = 'trans_type',
hue = 'isFraud')
plt.show()
let's see for other categories
# Setup figure
cols = 2
rows = 1
fig = plt.figure(figsize= (10,8))
# Plotting
for i, col in enumerate(['isMerchant', 'isFlaggedFraud']):
ax=fig.add_subplot(rows, cols, i+1)
sns.countplot(x=data[col], hue='isFraud', data=data, ax=ax)
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()
numeric = [col for col in data.columns if data[col].dtype in ['int64', 'float64']]
numeric.remove('isFraud')
categoric = [col for col in data.columns if data[col].dtype == 'object']
# correlation heatmap df
correlation = data[numeric].corr()
sns.heatmap(correlation, annot=True, fmt='.2f')
<Axes: >
# correlation heatmap df
correlation = data[numeric].corr()
# Get column pairs with correlation > 0.5
high_corr_pairs = []
for i in range(len(correlation.columns)):
for j in range(i+1, len(correlation.columns)):
if abs(correlation.iloc[i, j]) > 0.5:
high_corr_pairs.append((correlation.columns[i], correlation.columns[j], correlation.iloc[i, j]))
# Check if there are high correlation pairs
if high_corr_pairs:
# Print column pairs with correlation > 0.5 and their correlation values
for pair in high_corr_pairs:
print(pair[0], "-", pair[1], "Correlation:", pair[2])
else:
print("No high correlation columns")
old_orig_bal - new_orig_bal Correlation: 0.9988027631723787 old_dest_bal - new_dest_bal Correlation: 0.9765685054474923
these variables have a very high correlation
# so we will drop one from each pair
data.drop(columns=['old_orig_bal', 'old_dest_bal'], inplace=True)
# Define your color palette
color = ['#40DFEF', '#E78EA9']
# Create the box plot
sns.boxplot(data=data, x='isFraud', y='time_step', palette=color)
# Display the plot
plt.show()
This means that on average,
fraudulent transactions were made at later hours, or at a greater time step (hrs)
Normality tests are used to determine if a dataset is normally distributed about the mean value. it is assumed that during any measurement values will follow a normal distribution with an equal number of measurements above and below the mean value.
on the other hand, Gaussian distribution is a continuous probability distribution with symmetrical sides around its center. Its mean, median and mode are equal.
Popular normality tests - D’Agostino’s K^2, Shapiro-Wilk, Anderson-Darling .
# get a list of numerical columns in our dataset
numeric = [col for col in data.columns if data[col].dtype in ['int64', 'float64']]
# loop over each numerical column and test for normality
for col in data[numeric]:
stat, p = stats.normaltest(data[col])
# print('Statistics=%.5f, p=%.3f' % (stat, p))
# interpret
alpha = 0.05
if p > alpha:
print(f'looks Gaussian (fail to reject H0) for this column: {col}')
else:
print(f'does not look Gaussian (reject H0) for this column: {col}')
does not look Gaussian (reject H0) for this column: time_step does not look Gaussian (reject H0) for this column: trans_amt does not look Gaussian (reject H0) for this column: new_orig_bal does not look Gaussian (reject H0) for this column: new_dest_bal does not look Gaussian (reject H0) for this column: isFraud does not look Gaussian (reject H0) for this column: isFlaggedFraud does not look Gaussian (reject H0) for this column: isMerchant
data.shape
(6362620, 8)
data.duplicated().sum()
1975
let's have a closer look at the duplicates
data['trans_type'].unique()
array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'], dtype=object)
dup = data.loc[data.duplicated(),:]
dup.head(22)
time_step | trans_type | trans_amt | new_orig_bal | new_dest_bal | isFraud | isFlaggedFraud | isMerchant | |
---|---|---|---|---|---|---|---|---|
11104 | 7 | PAYMENT | 1849.50 | 0.0 | 0.0 | 0 | 0 | 1 |
26143 | 8 | PAYMENT | 1433.34 | 0.0 | 0.0 | 0 | 0 | 1 |
33556 | 8 | PAYMENT | 7759.31 | 0.0 | 0.0 | 0 | 0 | 1 |
51742 | 9 | PAYMENT | 28319.01 | 0.0 | 0.0 | 0 | 0 | 1 |
59969 | 9 | PAYMENT | 2388.93 | 0.0 | 0.0 | 0 | 0 | 1 |
60370 | 9 | PAYMENT | 12099.17 | 0.0 | 0.0 | 0 | 0 | 1 |
60763 | 9 | PAYMENT | 10042.85 | 0.0 | 0.0 | 0 | 0 | 1 |
63096 | 9 | PAYMENT | 2783.83 | 0.0 | 0.0 | 0 | 0 | 1 |
66667 | 9 | PAYMENT | 9494.15 | 0.0 | 0.0 | 0 | 0 | 1 |
69138 | 9 | PAYMENT | 6755.57 | 0.0 | 0.0 | 0 | 0 | 1 |
72575 | 9 | PAYMENT | 6499.28 | 0.0 | 0.0 | 0 | 0 | 1 |
80737 | 10 | PAYMENT | 21758.90 | 0.0 | 0.0 | 0 | 0 | 1 |
82632 | 10 | PAYMENT | 25948.57 | 0.0 | 0.0 | 0 | 0 | 1 |
96959 | 10 | PAYMENT | 1836.59 | 0.0 | 0.0 | 0 | 0 | 1 |
98885 | 10 | PAYMENT | 8837.91 | 0.0 | 0.0 | 0 | 0 | 1 |
100707 | 10 | PAYMENT | 24598.54 | 0.0 | 0.0 | 0 | 0 | 1 |
104645 | 10 | PAYMENT | 6775.92 | 0.0 | 0.0 | 0 | 0 | 1 |
105715 | 10 | PAYMENT | 3237.54 | 0.0 | 0.0 | 0 | 0 | 1 |
120246 | 11 | PAYMENT | 2041.31 | 0.0 | 0.0 | 0 | 0 | 1 |
125493 | 11 | PAYMENT | 16934.00 | 0.0 | 0.0 | 0 | 0 | 1 |
131202 | 11 | PAYMENT | 2082.03 | 0.0 | 0.0 | 0 | 0 | 1 |
136291 | 11 | PAYMENT | 2869.39 | 0.0 | 0.0 | 0 | 0 | 1 |
these transactions look fraudulent, but they are not.
money was payed, but there is no trace of it in either sending or receiving accounts
so we'll leave them. Remember, we have to be careful as to avoid deleting useful training information.
these do not look like duplicates, since the amount is different for each record. So we'll ignore
data.isnull().sum()
time_step 0 trans_type 0 trans_amt 0 new_orig_bal 0 new_dest_bal 0 isFraud 0 isFlaggedFraud 0 isMerchant 0 dtype: int64
# Drop the missing rows
# data = data.dropna()
df = data
X = df.drop('isFraud', axis=1)
y = df['isFraud']
#Defining colors for the plots
palette = ['#008080','#FF6347', '#E50000', '#D2691E']
palette2 = ['#FF6347', '#008080', '#E50000', '#D2691E']
l1 = list(data['isFraud'].value_counts())
pie_values = [l1[0] / sum(l1) * 100, l1[1] / sum(l1) * 100]
fig = plt.subplots(nrows = 1,ncols = 2,figsize = (20,7))
plt.subplot(1,2,1)
plt.pie(pie_values,labels = ['No Fraud','isFraud'],
autopct = '%1.2f%%',
explode = (0.1,0),
colors = palette,
wedgeprops = {'edgecolor': 'black','linewidth': 1, 'antialiased' : True})
plt.title('isFraud and No Fraud %');
plt.subplot(1,2,2)
ax = sns.countplot(data = data,
x='isFraud',
palette = palette,
edgecolor = 'black')
for i in ax.containers:
ax.bar_label(i,)
ax.set_xticklabels(['No Fraud','isFraud'])
plt.title('isFraud and No Fraud')
plt.show()
Dataset is imbalanced. Fraudulent cases are only 0.13% of our data.
This means that a blind guess (on "No Fraud") would give us accuracy of 99%
Therefore, we can't use Accuracy Score to choose our model
what to do to solve this issue:
#Oversampling the minority class (the churn customers)
ros = RandomOverSampler(random_state=0)
X_new,y_new= ros.fit_resample(X, y)
# X_new,y_new= X, y
print("After Random Over Sampling Of Minor Class Total Samples are :", len(y_new))
print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(y_new)))
After Random Over Sampling Of Minor Class Total Samples are : 12708814 Original dataset shape Counter({0: 6354407, 1: 8213}) Resampled dataset shape Counter({0: 6354407, 1: 6354407})
Now our data is balanced
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size = 0.20, random_state = 42)
X_train.shape, X_test.shape , y_train.shape, y_test.shape
((10167051, 7), (2541763, 7), (10167051,), (2541763,))
numeric = [col for col in data.columns if data[col].dtype in ['int64', 'float64']]
numeric.remove('isFraud')
print("numeric_cols:", numeric)
print("categoric_cols:", categoric)
categoric = [col for col in data.columns if data[col].dtype == 'object']
numeric_cols: ['time_step', 'trans_amt', 'new_orig_bal', 'new_dest_bal', 'isFlaggedFraud', 'isMerchant'] categoric_cols: ['trans_type']
# calling our encoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False, drop="first")
# setting output to DataFrame
encoder.set_output(transform="pandas")
# encoding our data
X_catEncoded_train = encoder.fit_transform(X_train[categoric])
X_catEncoded_test = encoder.transform(X_test[categoric]) # encoding our test data
Scaler = StandardScaler().fit(X_train[numeric]).set_output(transform="pandas")
X_numScaled_train = Scaler.transform(X_train[numeric])
X_numScaled_test = Scaler.transform(X_test[numeric])
X_train = pd.concat([X_numScaled_train, X_catEncoded_train], axis=1)
X_test = pd.concat([X_numScaled_test, X_catEncoded_test], axis=1)
model= LogisticRegression()
model=model.fit(X_train, y_train)
pred = model.predict(X_test)
prob = model.predict_proba(X_test)[:,1]
r_lgt= recall_score(y_test, pred)
print("recall_score : ", r_lgt)
p_lgt= precision_score(y_test, pred)
print("precision_score :",p_lgt)
f1_lgt= f1_score(y_test, pred)
print("f1_score :", f1_lgt)
f2_lgt = fbeta_score(y_test, pred, beta=2, average='binary')
print("f2_score :", f2_lgt)
A_lgt= accuracy_score(pred, y_test)
print("accuracy_score :",A_lgt)
acu_lgt = roc_auc_score(pred, y_test)
print("ROC_AUC Score:",acu_lgt)
recall_score : 0.8822669455184645 precision_score : 0.804040798063354 f1_score : 0.8413394550069743 f2_score : 0.8654272430336448 accuracy_score : 0.8336154865736892 ROC_AUC Score: 0.8368043531734168
# plot the model evaluation
fpr, tpr, _ = roc_curve(y_test, prob)
fig, ax = plt.subplots(figsize=(10,7))
plt.title('Logistic Regression ROC curve')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.plot(fpr,tpr)
plt.plot((0,1), linestyle="--",color='black')
plt.show()
XG_model= XGBClassifier()
XG_model= XG_model.fit(X_train, y_train)
XG_pred = XG_model.predict(X_test)
XG_prob = XG_model.predict_proba(X_test)[:,1]
r_XG= recall_score(y_test, XG_pred)
print("recall_score : ", r_XG)
p_XG= precision_score(y_test, XG_pred)
print("precision_score :",p_XG)
f1_XG= f1_score(y_test, XG_pred)
print("f1_score :", f1_XG)
f2_XG = fbeta_score(y_test, XG_pred, beta=2, average='binary')
print("f2_score :", f2_XG)
A_XG= accuracy_score( y_test, XG_pred)
print("accuracy_score :",A_XG)
acu_XG = roc_auc_score(XG_pred, y_test)
print("ROC_AUC Score:",acu_XG)
recall_score : 0.9646777231719235 precision_score : 0.9752736020007668 f1_score : 0.9699467256107884 f2_score : 0.9667784397669578 accuracy_score : 0.9701089361990083 ROC_AUC Score: 0.9701642696785007
# plot the model evaluation
fpr, tpr, _ = roc_curve(y_test, XG_prob)
fig, ax = plt.subplots(figsize=(10,7))
plt.title('XGBoost ROC curve')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.plot(fpr,tpr)
plt.plot((0,1), linestyle="--",color='black')
plt.show()
dtmodel = DecisionTreeClassifier()
dtmodel = dtmodel.fit(X_train, y_train)
dt_pred = dtmodel.predict(X_test)
dt_prob = dtmodel.predict_proba(X_test)[:,1]
r_dt= recall_score(y_test, dt_pred)
print("recall_score : ", r_dt)
p_dt= precision_score(y_test, dt_pred)
print("precision_score :",p_dt)
f1_dt= f1_score(y_test, dt_pred)
print("f1_score :", f1_dt)
f2_dt = fbeta_score(y_test, dt_pred, beta=2, average='binary')
print("f2_score :", f2_dt)
A_dt= accuracy_score( y_test, dt_pred)
print("accuracy_score :", A_dt)
acu_dt = roc_auc_score(dt_pred, y_test)
print("ROC_AUC Score:",acu_dt)
recall_score : 1.0 precision_score : 0.9996554881214192 f1_score : 0.9998277143834889 f2_score : 0.9999310786288991 accuracy_score : 0.9998276786624087 ROC_AUC Score: 0.9998277440607096
# plot the model evaluation
fpr, tpr, _ = roc_curve(y_test, dt_prob)
fig, ax = plt.subplots(figsize=(10,7))
plt.title('dt ROC curve')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.plot(fpr,tpr)
plt.plot((0,1), linestyle="--",color='black')
plt.show()
False Negatives:
False negatives occur when the model incorrectly predicts that a patient does not have the disease when, in reality, they do. The consequence of a false negative is that the patient may not receive necessary treatment or intervention, potentially leading to undetected or untreated health issues. This can result in the progression of the disease, worsening health outcomes, and possibly even life-threatening situations if the disease is serious.
Given that false negatives are more costly in this medical diagnosis classification project, and I've balanced the classes using Random Over Sampler, it's advisable to prioritize metrics that focus on minimizing false negatives.
In this context, the most appropriate metric to use would be recall or the F2 score. Both of these metrics emphasize the minimization of false negatives, making them suitable for scenarios where the cost of missing positive cases (i.e., false negatives) is high.
compare_models = ['Logistic Regression', 'XGBClassifier', 'DecisionTreeClassifier']
data = {
'Accuracy': [A_lgt, A_XG, A_dt],
'Recall': [r_lgt, r_XG, r_dt],
'Precision': [p_lgt, p_XG, p_dt],
'f1_score': [f1_lgt, f1_XG, f1_dt],
'f2_score': [f2_lgt, f2_XG, f2_dt],
'ROC_AUC': [acu_lgt, acu_XG, acu_dt],
'Description': ['', 'best model', '']
}
result=pd.DataFrame(data=data, index=compare_models)
result
Accuracy | Recall | Precision | f1_score | f2_score | ROC_AUC | Description | |
---|---|---|---|---|---|---|---|
Logistic Regression | 0.833615 | 0.882267 | 0.804041 | 0.841339 | 0.865427 | 0.836804 | |
XGBClassifier | 0.970109 | 0.964678 | 0.975274 | 0.969947 | 0.966778 | 0.970164 | best model |
DecisionTreeClassifier | 0.999828 | 1.000000 | 0.999655 | 0.999828 | 0.999931 | 0.999828 |
In a fraud detection project, it is generally more important to prioritize false negatives over false positives. This is because false negatives mean that fraudulent transactions go undetected, leading to potential financial losses and other negative consequences.
False positives, while inconvenient, do not result in direct financial loss and can be managed with additional verification steps.
Given this context, the best metrics to focus on are Recall and f2_score. Recall measures the ability to identify actual fraud cases correctly (minimizing false negatives), while the f2_score places more emphasis on recall compared to precision.
Based on the above metrics, decisionTree Classifier is the best model for this fraud detection project as it has the highest recall (0.99) and a high f2_score (0.99), indicating it is effective in detecting fraudulent transactions while minimizing false negatives.
In the context of fraud detection, ROC_AUC is important because it gives a holistic view of the model's performance across all classification thresholds.
Based on the ROC_AUC values and previously discussed metrics (recall and f2_score), decisionTree Classifier stands out as the best model for this fraud detection project with the highest ROC_AUC (0.99).
dt_fold = KFold(n_splits = 5,shuffle=True)
# Define custom F2 scorer
f2_scorer = make_scorer(fbeta_score, beta=2)
score = cross_val_score(dtmodel, X_train, y_train, cv=dt_fold, scoring=f2_scorer, error_score="raise")
dt_cv_score = score.mean()
dt_cv_stdev = stdev(score)
print('Cross Validation f2 scores are: {}'.format(score))
print('Average Cross Validation f2 score: ', dt_cv_score)
print('Cross Validation f2 standard deviation: ', dt_cv_stdev)
Cross Validation f2 scores are: [0.9999129 0.99990826 0.99991209 0.99991915 0.99991113] Average Cross Validation f2 score: 0.9999127046617234 Cross Validation f2 standard deviation: 4.007776255302826e-06
after evaluation with KFold cross validation,
our best model maintains its high performance
# with open('model.pkl', 'wb') as f:
# pickle.dump(dt_tuned, f)
fig, ax = plt.subplots(figsize=(10,7))
y_pred_dt = dtmodel.predict(X_test)
cm = confusion_matrix(y_test, y_pred_dt, labels=dtmodel.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=dtmodel.classes_)
disp.plot(ax=ax)
plt.show()
Based on confusion matrix:
fimp = pd.Series(data=dtmodel.feature_importances_, index=X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(17,13))
plt.title("Feature importance")
ax = sns.barplot(y=fimp.index, x=fimp.values, palette=palette, orient='h')
What are the key factors that predict fraudulent customer?
Based on the feature importance, the key factors are:
Do these factors make sense? If yes, How? If not, How not?
Yes, these factors make sense. Large and unusual changes in balances, significant transaction amounts, and certain transaction types are common indicators of fraudulent activity. The timing of transactions can also reveal suspicious patterns.
What kind of prevention should be adopted while the company updates its infrastructure?
The company should adopt robust security measures, such as:
Assuming these actions have been implemented, how would you determine if they work?
To determine if the implemented actions are effective, you should: