!pip install pandas_profiling
!pip install lightgbm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ipaddress
import pandas_profiling as pp
%matplotlib inline
from sklearn import preprocessing
plt.rc("font", size=14)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm import LGBMClassifier
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
url = 'https://raw.githubusercontent.com/IBM/predict-fraud-using-auto-ai/master/data/fraud_dataset.csv'
df = pd.read_csv(url)
print(df.head())
print(df.shape)
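# Optional: pandas_profiling is imported above (as pp) but never used. A profile
# report summarizes per-column distributions, correlations, and missing values.
# Sketch of the standard ProfileReport API; how it renders depends on the
# notebook environment and pandas_profiling version.
profile = pp.ProfileReport(df)
profile.to_notebook_iframe()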
# NOTE: this notebook treats Fraud_Risk == 0 as the fraud class; verify this
# against the dataset's label encoding before relying on the class breakdown.
count_fraud = len(df[df['Fraud_Risk'] == 0])
count_non_fraud = len(df[df['Fraud_Risk'] == 1])
pct_of_non_fraud = count_non_fraud / (count_non_fraud + count_fraud)
print("Percentage of non-fraud-risk records:", round(pct_of_non_fraud * 100, 2))
pct_of_fraud = count_fraud / (count_non_fraud + count_fraud)
print("Percentage of fraud-risk records:", round(pct_of_fraud * 100, 2))
sns.countplot(x='Fraud_Risk',data=df, palette='hls')
plt.show()
df.groupby('Fraud_Risk').mean()   # per-class feature means
df.corr(method='pearson')         # pairwise Pearson correlations
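# Optional: the raw correlation table is hard to scan; seaborn (already imported)
# can render it as an annotated heatmap. The figure size is an arbitrary choice.
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(method='pearson'), annot=True, fmt=".2f", cmap="coolwarm")
plt.show()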
X = df[df.columns[0:12]]    # first 12 columns as features
y = df[df.columns[12:]]     # remaining column(s), i.e. the Fraud_Risk target
df.dtypes
df.isna().sum()             # missing values per column
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("Train_x Shape :: ", X_train.shape)
print("Train_y Shape :: ", y_train.shape)
print("Test_x Shape :: ", X_test.shape)
print("Test_y Shape :: ", y_test.shape)
# lgb.Dataset is only needed for the native lgb.train() API; the sklearn-style
# LGBMClassifier below is fit directly on X_train/y_train, so d_train goes unused.
d_train = lgb.Dataset(X_train, label=y_train)
def LGBM_classifier(features, target):
    """
    Train a LightGBM classifier on the given data.
    :param features: training feature matrix
    :param target: binary training labels (Fraud_Risk)
    :return: trained LGBMClassifier
    """
    model = LGBMClassifier(objective='binary', metric='binary_logloss')
    model.fit(features, target)
    return model
start = time.time()
trained_model = LGBM_classifier(X_train, y_train.values.ravel())
print("> Completion Time : ", time.time() - start)
print("Trained LGBM model :: ", trained_model)
predictions = trained_model.predict(X_test)
print("Train Accuracy :: ", accuracy_score(y_train, trained_model.predict(X_train)))
print("LGBM Model Test Accuracy is :: ", accuracy_score(y_test, predictions))
print(" Confusion matrix ", confusion_matrix(y_test, predictions))
# Plot the 12 most important features as ranked by LightGBM's split importance.
feat_imp = pd.Series(trained_model.feature_importances_, index=X.columns)
feat_imp.nlargest(12).plot(kind='barh', figsize=(8, 10))
!pip install shap
import shap
shap.initjs()
shap_values = shap.TreeExplainer(trained_model.booster_).shap_values(X_train)
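# The SHAP values are computed above but never visualized. A summary plot ranks
# features by their impact on the model output. Sketch only: for binary LightGBM
# models, older shap versions return a list with one array per class, so take the
# positive-class array when a list comes back.
vals = shap_values[1] if isinstance(shap_values, list) else shap_values
shap.summary_plot(vals, X_train)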