!pip install pandas_profiling
!pip install lightgbm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ipaddress
import pandas_profiling as pp
%matplotlib inline
from sklearn import preprocessing
plt.rc("font", size=14)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm import LGBMClassifier
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
url = 'https://raw.githubusercontent.com/IBM/predict-fraud-using-auto-ai/master/data/fraud_dataset.csv'
df = pd.read_csv(url)
print(df.head())
print(df.shape)
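# Optional: pandas_profiling is imported above (as pp) but never used. A profile
# report summarizes per-column distributions, correlations, and missing values.
# Sketch of the standard ProfileReport API; how it renders depends on the
# notebook environment and pandas_profiling version.
profile = pp.ProfileReport(df)
profile.to_notebook_iframe()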
# NOTE: this notebook treats Fraud_Risk == 0 as the fraud class; verify this
# against the dataset's label encoding before relying on the class breakdown.
count_fraud = len(df[df['Fraud_Risk'] == 0])
count_non_fraud = len(df[df['Fraud_Risk'] == 1])
pct_of_non_fraud = count_non_fraud / (count_non_fraud + count_fraud)
print("Percentage of non-fraud-risk records:", round(pct_of_non_fraud * 100, 2))
pct_of_fraud = count_fraud / (count_non_fraud + count_fraud)
print("Percentage of fraud-risk records:", round(pct_of_fraud * 100, 2))
sns.countplot(x='Fraud_Risk',data=df, palette='hls')
plt.show()
df.groupby('Fraud_Risk').mean()   # per-class feature means
df.corr(method='pearson')         # pairwise Pearson correlations
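# Optional: the raw correlation table is hard to scan; seaborn (already imported)
# can render it as an annotated heatmap. The figure size is an arbitrary choice.
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(method='pearson'), annot=True, fmt=".2f", cmap="coolwarm")
plt.show()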
X = df[df.columns[0:12]]    # first 12 columns as features
y = df[df.columns[12:]]     # remaining column(s), i.e. the Fraud_Risk target
df.dtypes
df.isna().sum()             # missing values per column
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("Train_x Shape :: ", X_train.shape)
print("Train_y Shape :: ", y_train.shape)
print("Test_x Shape :: ", X_test.shape)
print("Test_y Shape :: ", y_test.shape)
# lgb.Dataset is only needed for the native lgb.train() API; the sklearn-style
# LGBMClassifier below is fit directly on X_train/y_train, so d_train goes unused.
d_train = lgb.Dataset(X_train, label=y_train)
def LGBM_classifier(features, target):
    """
    Train a LightGBM classifier on the given data.
    :param features: training feature matrix
    :param target: binary training labels (Fraud_Risk)
    :return: trained LGBMClassifier
    """
    model = LGBMClassifier(objective='binary', metric='binary_logloss')
    model.fit(features, target)
    return model
start = time.time()
trained_model = LGBM_classifier(X_train, y_train.values.ravel())
print("> Completion Time : ", time.time() - start)
print("Trained LGBM model :: ", trained_model)
predictions = trained_model.predict(X_test)
print("Train Accuracy :: ", accuracy_score(y_train, trained_model.predict(X_train)))
print("LGBM Model Test Accuracy is :: ", accuracy_score(y_test, predictions))
print(" Confusion matrix ", confusion_matrix(y_test, predictions))
# Plot the 12 most important features as ranked by LightGBM's split importance.
feat_imp = pd.Series(trained_model.feature_importances_, index=X.columns)
feat_imp.nlargest(12).plot(kind='barh', figsize=(8, 10))
!pip install shap
import shap
shap.initjs()
shap_values = shap.TreeExplainer(trained_model.booster_).shap_values(X_train)
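# The SHAP values are computed above but never visualized. A summary plot ranks
# features by their impact on the model output. Sketch only: for binary LightGBM
# models, older shap versions return a list with one array per class, so take the
# positive-class array when a list comes back.
vals = shap_values[1] if isinstance(shap_values, list) else shap_values
shap.summary_plot(vals, X_train)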