This notebook accompanies the Toptal blog post found here.
# Load libraries and check memory
import psutil ; print(list(psutil.virtual_memory())[0:2])
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import xgboost as xgb
import pickle
import gc
gc.collect()
print(list(psutil.virtual_memory())[0:2])
# Load custom functions
import GAN_171103
# For reloading after making changes
import importlib
importlib.reload(GAN_171103)
from GAN_171103 import *
# Load engineered dataset from EDA section
data = pickle.load(open('data/' + 'credicard.engineered.pkl','rb'))
# The data columns are all columns except 'Class'
data_cols = list(data.columns[ data.columns != 'Class' ])
label_cols = ['Class']
print(data_cols)
print('# of data columns: ',len(data_cols))
# Put columns in order of importance for xgboost fraud detection (this list is generated in the xgboost feature-importance section below)
# sorted_cols = ['V14', 'V4', 'V12', 'V10', 'V26', 'V17', 'Amount', 'V7', 'V21', 'V28', 'V20', 'V3', 'V18', 'V8', 'V13', 'V22', 'V16', 'V11', 'V19', 'V27', 'V5', 'V6', 'V25', 'V15', 'V24', 'V9', 'V1', 'V2', 'V23', 'Class']
# sorted_cols = ['V14', 'V4', 'V12', 'V10', 'Amount', 'V26', 'V17', 'Time', 'V7', 'V28', 'V21', 'V19', 'V8', 'V3', 'V22', 'V20', 'V25', 'V11', 'V6', 'V16', 'V27', 'V5', 'V18', 'V9', 'V1', 'V2', 'V15', 'V23', 'V24', 'V13', 'Class']
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'Time', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
data = data[ sorted_cols ].copy()
# Add KMeans-generated classes to the fraud data - see the clustering section at the end of this notebook for more details
import sklearn.cluster as cluster
train = data.loc[ data['Class']==1 ].copy()
algorithm = cluster.KMeans
args, kwds = (), {'n_clusters':2, 'random_state':0}
labels = algorithm(*args, **kwds).fit_predict(train[ data_cols ])
print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )
fraud_w_classes = train.copy()
fraud_w_classes['Class'] = labels
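# A quick visual check (not part of the original pipeline): how the two KMeans
# clusters separate on two of the most important xgboost features
for c in np.unique(labels):
    plt.scatter( fraud_w_classes.loc[ fraud_w_classes['Class']==c, 'V14' ],
                 fraud_w_classes.loc[ fraud_w_classes['Class']==c, 'V4' ],
                 label='cluster '+str(c), alpha=0.5 )
plt.xlabel('V14') ; plt.ylabel('V4') ; plt.legend() ; plt.show()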
# Function to create toy spiral dataset (looks like swiss roll)
def create_toy_spiral_df(n, seed=0):
    np.random.seed(seed)
    toy = np.array([ [ (i/10+1) * np.sin(i), -(i/10+1) * np.cos(i) ] for i in np.random.uniform(0, 3*np.pi, size=n) ])
    toy = pd.DataFrame( toy, columns=['v'+str(i+1) for i in range(2)] )
    return toy
# toy = create_toy_spiral_df(1000)
# plt.scatter( toy['v1'], toy['v2'] ) ;
# Function to create toy dataset of multiple groups of normal distributions in n dimensions
def create_toy_df(n, n_dim, n_classes, seed=0):
    toy = pd.DataFrame( columns=['v'+str(i+1) for i in range(n_dim)] + ['Class'] )
    toy_cols = toy.columns
    np.random.seed(seed)
    for class0 in range(n_classes):
        center0s = np.random.randint(-10, 10, size=n_dim) / 10
        var0s = np.random.randint(1, 3, size=n_dim) / 10
        temp = np.array([[class0]] * n)
        for dim0 in range(n_dim):
            temp = np.hstack( [np.random.normal(center0s[dim0], var0s[dim0], n).reshape(-1, 1), temp] )
        toy = pd.concat([toy, pd.DataFrame(temp, columns=toy_cols)], axis=0).reset_index(drop=True)
    return toy
# toy = create_toy_df(n=1000,n_dim=2,n_classes=2,seed=0)
# plt.scatter(toy[toy.columns[0]],toy[toy.columns[1]],c=toy['Class'], alpha=0.2) ;
# Load the credit card data
# Original data available from:
# https://www.kaggle.com/dalpozz/creditcardfraud
data = pd.read_csv("data/creditcard.csv.zip")
print(data.shape)
print(data.columns)
data.head(3)
# The data columns are all columns except 'Class'
label_cols = ['Class']
data_cols = list(data.columns[ data.columns != 'Class' ])
print(data_cols)
print('# of data columns: ',len(data_cols))
# 284315 normal transactions (class 0)
# 492 fraud transactions (class 1)
data.groupby('Class')['Class'].count()
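# The imbalance in one number: 'Class' is 0/1, so its mean is the fraud fraction (~0.17%)
print( 'Fraud fraction: {:.3%}'.format( data['Class'].mean() ) )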
# Total nulls in dataset (sum over rows, then over columns)
data.isnull().sum().sum()
# Duplicates? Yes
normal_duplicates = sum( data.loc[ data.Class==0 ].duplicated() )
fraud_duplicates = sum( data.loc[ data.Class==1 ].duplicated() )
total_duplicates = normal_duplicates + fraud_duplicates
print( 'Normal duplicates', normal_duplicates )
print( 'Fraud duplicates', fraud_duplicates )
print( 'Total duplicates', total_duplicates )
print( 'Fraction duplicated', total_duplicates / len(data) )
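# Optional (not done here, so the counts below stay comparable to the original data):
# duplicates could be removed with
# data = data.drop_duplicates().reset_index(drop=True)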
# 'Time' is seconds since the first transaction in the set
# ~48 hours' worth of data
# Convert 'Time' to time of day, in hours
print( 'Last time value: {:.2f}'.format( data['Time'].max() / 3600 ) )
data['Time'] = ( data['Time'].values / 3600 ) % 24
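# Worked example of the conversion: 100000 s -> (100000/3600) % 24 ~= 3.78 h,
# i.e. about 3:47 am on the second day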
plt.hist( [ data.loc[ data['Class']==0, 'Time'], data.loc[ data['Class']==1, 'Time'] ],
          density=True, label=['normal','fraud'], bins=np.linspace(0,24,25))
plt.legend()
plt.show()
# Looks like normal transactions have a bias towards 8am to midnight
# Fraud has spikes at 2-3am and noon
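# To quantify the spikes, a quick sketch of the fraud rate by hour of day
# ('Class' is 0/1, so the mean per hour bin is the fraud fraction in that hour)
hourly_fraud_rate = data.groupby( data['Time'].astype(int) )['Class'].mean()
print( hourly_fraud_rate.sort_values(ascending=False).head() )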
# Several columns are heavily skewed, with 'Amount' the most skewed (besides 'Class')
data.skew()
# Minimum 'Amount' is 0
# 0's account for 0.6% of the data set
print( data['Amount'].min() )
print( np.sum( data['Amount']==0 ) )
# print( np.sum( data['Amount']<0.01 ) )
print( np.sum( data['Amount']==0 ) / len(data) )
# Looks like all 'Amount' values are rounded to the hundredths (0.01) place
data['Amount'].mod(0.01).hist() ;
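# A float-safe version of the same check (mod(0.01) on floats picks up rounding noise):
# are all amounts exact to the cent?
print( np.allclose( data['Amount'], data['Amount'].round(2) ) )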
# Some values are much more frequent than others
# 0.00 comes in 12th in the list
print( data.Amount.value_counts().head(15) )
# Log-transform the amount values to give a more normal distribution
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
plt.hist(data['Amount'], bins=40)
plt.title('Original Amount Distribution')
plt.subplot(1,2,2)
d0 = np.log10( data['Amount'].values + 1 )
# d0 = np.log1p( data['Amount'].values ) / np.log(10)
plt.hist( d0, bins=40 )
plt.title('Log10(x+1) Transformed Amount Distribution')
plt.show()
# Use log transformed data
data['Amount'] = d0
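# Note for later: to map a transformed value x back to a dollar amount, invert with 10**x - 1
# e.g. x = 2.0 corresponds to 10**2 - 1 = 99 dollars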
# Center and scale all the data using only the middle 99.8% of each column, so outliers don't pull the statistics too much
# First generate the percentile data for each feature
percentiles = pd.DataFrame( np.array([ np.percentile( data[i], [ 0.1, 99.9 ] ) for i in data_cols ]).T,
                            columns=data_cols, index=['min','max'] )
percentile_means = \
    [ [ np.mean( data.loc[ (data[i]>percentiles[i]['min']) & (data[i]<percentiles[i]['max']), i ] ) ]
      for i in data_cols ]
percentiles = pd.concat([ percentiles,
                          pd.DataFrame(np.array(percentile_means).T, columns=data_cols, index=['mean']) ])
percentile_stds = \
    [ [ np.std( data.loc[ (data[i]>percentiles[i]['min']) & (data[i]<percentiles[i]['max']), i ] ) ]
      for i in data_cols ]
percentiles = pd.concat([ percentiles,
                          pd.DataFrame(np.array(percentile_stds).T, columns=data_cols, index=['stdev']) ])
percentiles
# Center and scale the data using the percentile data we just generated
data[data_cols] = ( data[data_cols] - percentiles.loc[ 'mean', data_cols ] ) / percentiles.loc[ 'stdev', data_cols ]
# # Or we can center and scale using all of the data
# from sklearn.preprocessing import StandardScaler
# data[data_cols] = StandardScaler().fit_transform(data[data_cols])
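# Quick sanity check on the scaled data: means and stds won't be exactly 0 and 1,
# since the statistics were computed on the trimmed middle 99.8% of each column
print( 'max |mean|:', data[data_cols].mean().abs().max() )
print( 'max std:   ', data[data_cols].std().max() )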
# There are outliers 50-100 stdevs away from the mean in several columns
plot_cols = data_cols
# plt.plot( np.log10( data[ plot_cols ].abs().max().values ) )
# plt.plot( data[ plot_cols ].abs().max().values / data[ plot_cols ].std().values / 10, label='max z/10' )
plt.plot( data.loc[ data.Class==1, plot_cols ].abs().max().values / data[ plot_cols ].std().values / 10, label='fraud max z/10' )
plt.plot( data.loc[ data.Class==0, plot_cols ].abs().max().values / data[ plot_cols ].std().values / 10, label='real max z/10' )
plt.plot( data[ plot_cols ].mean().values, label='mean' )
# plt.plot( data[ plot_cols ].abs().mean().values, label='abs mean' )
plt.plot( data[ plot_cols ].std().values, label='std' )
plt.xlabel('Feature')
plt.ylabel('z/10')
plt.legend() ;
# Check correlations
# Note: no correlations among the PCA-transformed columns, as expected
corr0 = data.corr()
plt.imshow(corr0) ;
# Looking at correlation values
# np.round(corr0[['Time','Amount','Class']],2)
# plt.imshow( np.round(corr0[['Time','Amount','Class']],2) ) ;
# corr0[data_cols]
# np.round(corr0[data_cols],1)
# np.round(corr0[data_cols],1)
# Plot the data by each feature
axarr = [[]]*len(data_cols)
columns = 4
rows = int( np.ceil( len(data_cols) / columns ) )
f, fig = plt.subplots( figsize=(columns*3.5, rows*2) )
f.suptitle('Data Distributions by Feature and Class', size=16)
for i, col in enumerate(data_cols):
    axarr[i] = plt.subplot2grid( (int(rows), int(columns)), (int(i//columns), int(i%columns)) )
    axarr[i].hist( [ data.loc[ data.Class == 0, col ], data.loc[ data.Class == 1, col ] ], label=['normal','fraud'],
                   bins=np.linspace( np.percentile(data[col],0.1), np.percentile(data[col],99.9), 30 ),
                   density=True )
    axarr[i].set_xlabel(col, size=12)
    axarr[i].set_ylim([0,0.8])
    axarr[i].tick_params(axis='both', labelsize=10)
    if i == 0:
        legend = axarr[i].legend()
        legend.get_frame().set_facecolor('white')
    if i % 4 != 0:
        axarr[i].tick_params(axis='y', left=False, labelleft=False)
    else:
        axarr[i].set_ylabel('Fraction', size=12)
plt.tight_layout(rect=[0,0,1,0.95]) # xmin, ymin, xmax, ymax
# plt.savefig('plots/Engineered_Data_Distributions.png')
plt.show()
# Save engineered dataset for use in analysis
# Save as pickle for faster reload
pickle.dump(data, open('data/' + 'credicard.engineered.pkl','wb'))
# # Save as csv for human readability - much slower save
# data.to_csv('data/' + 'credicard.engineered.csv.zip')
# define the columns we want to test on, in case we want to use less than the full set
test_cols = data.columns
# test_cols = data.columns[ data.columns != 'Amount' ]
print(len(test_cols))
print(test_cols)
# Define some custom metric functions for use with the xgboost algorithm
# https://github.com/dmlc/xgboost/blob/master/demo/guide-python/custom_objective.py
from sklearn.metrics import recall_score, precision_score, roc_auc_score
def recall(preds, dtrain):
    labels = dtrain.get_label()
    return 'recall', recall_score(labels, np.round(preds))

def precision(preds, dtrain):
    labels = dtrain.get_label()
    return 'precision', precision_score(labels, np.round(preds))

def roc_auc(preds, dtrain):
    labels = dtrain.get_label()
    return 'roc_auc', roc_auc_score(labels, preds)
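# A tiny sanity check of the custom metrics on a hand-made DMatrix
# (toy values, just to confirm the (name, value) return format xgboost expects)
_dm = xgb.DMatrix( np.zeros((2,1)), label=[1,0] )
print( recall( np.array([0.9, 0.2]), _dm ) )   # ('recall', 1.0)
print( roc_auc( np.array([0.9, 0.2]), _dm ) )  # ('roc_auc', 1.0)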
# Set up the test and train sets
np.random.seed(0)
n_real = np.sum(data.Class==0) # 200000
n_test = np.sum(data.Class==1) # 492
train_fraction = 0.7
fn_real = int(n_real * train_fraction)
fn_test = int(n_test * train_fraction)
real_samples = data.loc[ data.Class==0, test_cols].sample(n_real, replace=False).reset_index(drop=True)
test_samples = data.loc[ data.Class==1, test_cols].sample(n_test, replace=False).reset_index(drop=True)
train_df = pd.concat([real_samples[:fn_real],test_samples[:fn_test]],axis=0,ignore_index=True).reset_index(drop=True)
# train_df = pd.concat([real_samples[:fn_test],test_samples[:fn_test]],axis=0,ignore_index=True).reset_index(drop=True)
test_df = pd.concat([real_samples[fn_real:],test_samples[fn_test:]],axis=0,ignore_index=True).reset_index(drop=True)
print( 'classes 0, 1: ', n_real, n_test )
print( 'train, test: ', len(train_df), len(test_df) )
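# Quick check (not in the original flow) of the class balance carried into each split
print( 'train fraud fraction: {:.4f}'.format( train_df['Class'].mean() ) )
print( 'test fraud fraction:  {:.4f}'.format( test_df['Class'].mean() ) )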
X_col = test_df.columns[:-1]
y_col = test_df.columns[-1]
dtrain = xgb.DMatrix(train_df[X_col], train_df[y_col], feature_names=X_col)
dtest = xgb.DMatrix(test_df[X_col], test_df[y_col], feature_names=X_col)
# Run the xgboost algorithm, maximizing recall on the test set
results_dict = {}
xgb_params = {
# 'max_depth': 4,
'objective': 'binary:logistic',
'random_state': 0,
'eval_metric': 'auc', # auc, error
# 'tree_method': 'hist'
# 'grow_policy': 'lossguide' # depthwise, lossguide
}
xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=100,
verbose_eval=False,
early_stopping_rounds=20,
evals=[(dtrain,'train'),(dtest,'test')],
evals_result = results_dict,
feval = recall, maximize=True
# feval = roc_auc, maximize=True
)
y_pred = xgb_test.predict(dtest, ntree_limit=xgb_test.best_iteration+1)
y_true = test_df['Class'].values
print( 'best iteration: ', xgb_test.best_iteration )
print( recall( y_pred, dtest ) )
print( precision( y_pred, dtest ) )
print( roc_auc( y_pred, dtest ) )
# print( 'Accuracy: {:.3f}'.format(SimpleAccuracy(y_pred, y_true)) )
SimpleMetrics( np.round(y_pred), y_true)
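# If the GAN_171103 helpers aren't available, a rough sklearn stand-in for the confusion
# matrix / accuracy report (assumed behaviour, not the original SimpleMetrics implementation):
from sklearn.metrics import confusion_matrix, accuracy_score
print( confusion_matrix( y_true, np.round(y_pred) ) )
print( 'Accuracy: {:.3f}'.format( accuracy_score( y_true, np.round(y_pred) ) ) )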
# Let's look at how the metrics changed on the train and test sets as more trees were added
for i in results_dict:
    for err in results_dict[i]:
        plt.plot(results_dict[i][err], label=i+' '+err)
plt.axvline(xgb_test.best_iteration, c='green', label='best iteration')
plt.xlabel('iteration')
# plt.ylabel(err)
plt.title('xgboost learning curves')
plt.legend()
plt.grid() ;
# Plot feature importances
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
xgb.plot_importance(xgb_test, max_num_features=10, height=0.5, ax=ax);
# Generate list of features sorted by importance in detecting fraud
# https://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
import operator
x = xgb_test.get_fscore()
sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)
# print( 'Top eight features for fraud detection: ', [ i[0] for i in sorted_x[:8] ] )
sorted_cols = [i[0] for i in sorted_x] + ['Class']
print( sorted_cols )
# Plot all of the training data with paired features sorted by importance
# This takes a while
colors = ['red','blue']
markers = ['o','^']
labels = ['real','fraud']
alphas = [0.7, 0.9]
columns = 4
rows = int( np.ceil( len(data_cols) / columns / 2 ) )
plt.figure( figsize=(columns*3.5, rows*3) )
plt.suptitle('XGBoost Sorted Data Distributions ', size=16)
train = train_df.copy()
for i in range( int(np.floor(len(sorted_x)/2)) ):
    col1, col2 = sorted_x[i*2][0], sorted_x[i*2+1][0]
    # print(i, col1, col2)
    plt.subplot(rows, columns, i+1)
    for group, color, marker, label, alpha in zip( train.groupby('Class'), colors, markers, labels, alphas ):
        plt.scatter( group[1][col1], group[1][col2],
                     label=label, marker=marker, alpha=alpha,
                     edgecolors=color, facecolors='none' )
    plt.xlabel(col1, size=12)
    plt.ylabel(col2, size=12)
    plt.tick_params(axis='both', labelsize=10)
    if i == 0: plt.legend(fontsize=12, edgecolor='black')
plt.tight_layout(rect=[0,0,1,0.95]) # xmin, ymin, xmax, ymax
# plt.savefig('plots/XGB_Sorted_Data_Distributions.png')
plt.show()
# Let's look at the effect of the normal:fraud ratio in the training set on recall and roc_auc
# We'll use cross-validation to see whether the differences are significant
np.random.seed(0)
n_real = np.sum(data.Class==0) # 200000
n_test = np.sum(data.Class==1) # 492
real_samples = data.loc[ data.Class==0, test_cols].sample(n_real, replace=False).reset_index(drop=True)
test_samples = data.loc[ data.Class==1, test_cols].sample(n_test, replace=False).reset_index(drop=True)
X_col = data.columns[:-1]
y_col = data.columns[-1]
test_data=[]
# for i in [1]:
# for i in [0.1,0.5,1,2,10]:
for i in np.logspace(-1, 2, 8):
    print(i)
    train_df = pd.concat([real_samples[:int(n_test*i)], test_samples[:n_test]], axis=0, ignore_index=True).reset_index(drop=True)
    dtrain = xgb.DMatrix(train_df[X_col], train_df[y_col], feature_names=X_col)
    results = xgb.cv(xgb_params, dtrain,
                     nfold=5, num_boost_round=100, early_stopping_rounds=10, seed=0,
                     feval=recall)
    test_data.append(list([i]) + list(results.tail(1).index) + list(results.tail(1).values[0]))
test_data = pd.DataFrame(test_data, columns=list(['ratio','best'])+list(results.columns))
test_data
# Recall decreases as more normal data is added
# metric = 'auc'
metric = 'recall'
# xs = test_data['ratio'].values
xs = np.log10(test_data['ratio'].values)
ys = test_data['test-'+metric+'-mean'].values
stds = test_data['test-'+metric+'-std'].values
plt.plot(xs,ys,c='C1')
plt.plot(xs,ys+stds,linestyle=':',c='C2')
plt.plot(xs,ys-stds,linestyle=':',c='C2')
plt.xlabel('log10 ratio of normal:fraud data')
plt.ylabel(metric)
# plt.ylim([0.96,1.01])
plt.show()
# load clustering libraries
import sklearn.cluster as cluster
# hdbscan not in kaggle/python at present
!pip install hdbscan
import hdbscan
# Set up training set to consist of only fraud data
train = data.loc[ data['Class']==1 ].copy()
print( pd.DataFrame( [ [np.sum(train['Class']==i)] for i in np.unique(train['Class']) ], columns=['count'], index=np.unique(train['Class']) ) )
# train = pd.get_dummies(train, columns=['Class'], prefix='Class')
label_cols = [ i for i in train.columns if 'Class' in i ]
data_cols = [ i for i in train.columns if i not in label_cols ]
train_no_label = train[ data_cols ]
%%time
# t-SNE is an interesting method for mapping higher-dimensional data into two dimensions
# http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
# Note that a t-SNE map may not mean what you think it does:
# https://distill.pub/2016/misread-tsne/
# Create multiple projections to compare results from different random states
from sklearn.manifold import TSNE
projections = [ TSNE(random_state=i).fit_transform(train_no_label) for i in range(3) ]
%%time
# Now we'll compare some different clustering algorithms
# https://github.com/scikit-learn-contrib/hdbscan/blob/master/docs/comparing_clustering_algorithms.rst
algorithms = [
# [ 'KMeans', cluster.KMeans, (), {'random_state':0} ],
[ 'KMeans', cluster.KMeans, (), {'n_clusters':2, 'random_state':0} ],
# [ 'KMeans 3', cluster.KMeans, (), {'n_clusters':3, 'random_state':0} ],
# [ 'Agglomerative', cluster.AgglomerativeClustering, (), {} ],
[ 'Agglomerative', cluster.AgglomerativeClustering, (), {'linkage': 'ward', 'n_clusters': 3} ],
# [ 'Agg. Ave 3', cluster.AgglomerativeClustering, (), {'linkage': 'average', 'n_clusters': 3} ],
# [ 'Agg. Complete 3', cluster.AgglomerativeClustering, (), {'linkage': 'complete', 'n_clusters': 3} ],
# [ 'DBSCAN', cluster.DBSCAN, (), {'eps':0.025} ],
# [ 'HDBSCAN', hdbscan.HDBSCAN, (), {} ],
[ 'HDBSCAN', hdbscan.HDBSCAN, (), {'min_cluster_size':10, 'min_samples':1, } ],
# [ 'HDBSCAN 2 10', hdbscan.HDBSCAN, (), {'min_cluster_size':2, 'min_samples':10, } ],
# [ 'HDBSCAN 10 10 ', hdbscan.HDBSCAN, (), {'min_cluster_size':10, 'min_samples':10, } ],
]
rows = len(algorithms)
columns = 4
plt.figure(figsize=(columns*3, rows*3))
for i, [name, algorithm, args, kwds] in enumerate(algorithms):
    print(i, name)
    labels = algorithm(*args, **kwds).fit_predict(train_no_label)
    # labels = algorithm(*args, **kwds).fit_predict(projections[0])
    # print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )
    colors = np.clip(labels, -1, 9)
    colors = [ 'C'+str(c0) if c0 > -1 else 'black' for c0 in colors ]
    plt.subplot(rows, columns, i*columns+1)
    plt.scatter(train_no_label[data_cols[0]], train_no_label[data_cols[1]], c=colors)
    plt.xlabel(data_cols[0]), plt.ylabel(data_cols[1])
    plt.title(name)
    for j in range(3):
        plt.subplot(rows, columns, i*columns+1+j+1)
        plt.scatter(*(projections[j].T), c=colors)
        plt.xlabel('x'), plt.ylabel('y')
        plt.title('TSNE projection '+str(j+1), size=12)
    # break
plt.suptitle('Comparison of Fraud Clusters', size=16)
plt.tight_layout(rect=[0,0,1,0.95])
plt.savefig('plots/Fraud_Cluster_Diagram.png')
plt.show()