#!/usr/bin/env python
# coding: utf-8

# # Imbalanced Learning
# ### Full article at www.aprendemachinelearning.com
# ## Handling imbalanced classes with the Python library imblearn
# Don't forget to install it first with:
#
#     pip install -U imbalanced-learn
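
# In[ ]:

# Optional sanity check, not in the original notebook: the resampling cells
# below use the `sampling_strategy` / `fit_resample` API, which requires
# imbalanced-learn 0.4 or newer.
import imblearn
print(imblearn.__version__)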

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from pylab import rcParams
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek
from imblearn.ensemble import BalancedBaggingClassifier
from collections import Counter

# Set up the graphic style; in this case I am using the color scheme from xkcd.com
rcParams['figure.figsize'] = 14, 8.7  # Golden Mean
LABELS = ["Normal", "Fraud"]
#col_list = ["cerulean", "scarlet"]  # https://xkcd.com/color/rgb/
#sns.set(style='white', font_scale=1.75, palette=sns.xkcd_palette(col_list))
get_ipython().run_line_magic('matplotlib', 'inline')


# ## Dataset: Credit Card Fraud Detection
# Download from Kaggle at https://www.kaggle.com/mlg-ulb/creditcardfraud/data

# In[2]:

df = pd.read_csv("creditcard.csv")  # read in data downloaded to the local directory
df.head(n=5)  # just to check you imported the dataset properly


# In[3]:

df.shape  # secondary check on the size of the dataframe


# In[4]:

df['Class'].value_counts(sort=True)  # class comparison 0=Normal 1=Fraud


# In[5]:

# If you don't have an intuitive sense of how imbalanced these two classes are, let's go visual
count_classes = df['Class'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.xticks(range(2), LABELS)
plt.title("Frequency by observation number")
plt.xlabel("Class")
plt.ylabel("Number of Observations");


# In[6]:

normal_df = df[df.Class == 0]  # save normal observations into a separate df
fraud_df = df[df.Class == 1]   # do the same for frauds


# In[7]:

# Plot of high-value transactions
# (`normed` was removed from matplotlib; `density` is the current argument)
bins = np.linspace(200, 2500, 100)
plt.hist(normal_df.Amount, bins, alpha=1, density=True, label='Normal')
plt.hist(fraud_df.Amount, bins, alpha=0.6, density=True, label='Fraud')
plt.legend(loc='upper right')
plt.title("Amount by percentage of transactions (transactions \$200+)")
plt.xlabel("Transaction amount (USD)")
plt.ylabel("Percentage of transactions (%)");
plt.show()


# In[8]:

bins = np.linspace(0, 48, 48)  # 48 hours
plt.hist((normal_df.Time/(60*60)), bins, alpha=1, density=True, label='Normal')
plt.hist((fraud_df.Time/(60*60)), bins, alpha=0.6, density=True, label='Fraud')
plt.legend(loc='upper right')
plt.title("Percentage of transactions by hour")
plt.xlabel("Transaction time as measured from first transaction in the dataset (hours)")
plt.ylabel("Percentage of transactions (%)");
#plt.hist((df.Time/(60*60)), bins)
plt.show()


# In[9]:

plt.scatter((normal_df.Time/(60*60)), normal_df.Amount, alpha=0.6, label='Normal')
plt.scatter((fraud_df.Time/(60*60)), fraud_df.Amount, alpha=0.9, label='Fraud')
plt.title("Amount of transaction by hour")
plt.xlabel("Transaction time as measured from first transaction in the dataset (hours)")
plt.ylabel('Amount (USD)')
plt.legend(loc='upper right')
plt.show()


# In[10]:

y = df['Class']
X = df.drop('Class', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)
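
# In[ ]:

# Optional check, not in the original notebook: with only ~0.17% fraud cases,
# an unstratified split can shift the class ratio between train and test
# (passing stratify=y to train_test_split above would keep them identical).
print("Train fraud ratio: {:.5f}".format(y_train.mean()))
print("Test fraud ratio:  {:.5f}".format(y_test.mean()))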
label="Normal", alpha=0.2) ax[0].scatter(X_reduced[y == 1, 0], X_reduced[y == 1, 1], label="Fraude", alpha=0.2) ax[0].set_title('PCA of original dataset') ax[0].legend() ax[1] = sns.countplot(y) ax[1].set_title('Number of observations per class') # In[12]: def run_model(X_train, X_test, y_train, y_test): clf_base = LogisticRegression(C=1.0,penalty='l2',random_state=1,solver="newton-cg") clf_base.fit(X_train, y_train) return clf_base # # Modelo sin balancear # In[13]: model = run_model(X_train, X_test, y_train, y_test) # In[14]: def mostrar_resultados(y_test, pred_y): conf_matrix = confusion_matrix(y_test, pred_y) plt.figure(figsize=(8, 8)) sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d"); plt.title("Confusion matrix") plt.ylabel('True class') plt.xlabel('Predicted class') plt.show() print (classification_report(y_test, pred_y)) # In[15]: pred_y = model.predict(X_test) mostrar_resultados(y_test, pred_y) # # 1 Estrategia: Penalización # In[16]: def run_model_balanced(X_train, X_test, y_train, y_test): clf = LogisticRegression(C=1.0,penalty='l2',random_state=1,solver="newton-cg",class_weight="balanced") clf.fit(X_train, y_train) return clf model = run_model_balanced(X_train, X_test, y_train, y_test) # In[17]: pred_y = model.predict(X_test) mostrar_resultados(y_test, pred_y) # # 2 NearMiss subsampling del grupo mayoritario # In[18]: us = NearMiss(ratio=0.5, n_neighbors=3, version=2, random_state=1) X_train_res, y_train_res = us.fit_sample(X_train, y_train) print ("Distribution of class labels before resampling {}".format(Counter(y_train))) print ("Distribution of class labels after resampling {}".format(Counter(y_train_res))) # In[19]: model = run_model(X_train_res, X_test, y_train_res, y_test) # In[20]: pred_y = model.predict(X_test) mostrar_resultados(y_test, pred_y) # # 3 Random Oversampling # In[21]: os = RandomOverSampler(ratio=0.5) X_train_res, y_train_res = os.fit_sample(X_train, y_train) print ("Distribution of class labels before resampling {}".format(Counter(y_train))) print ("Distribution of class labels after resampling {}".format(Counter(y_train_res))) # In[22]: model = run_model(X_train_res, X_test, y_train_res, y_test) # In[23]: pred_y = model.predict(X_test) mostrar_resultados(y_test, pred_y) # # 4 Combinando Smote tomek # In[24]: os_us = SMOTETomek(ratio=0.5) X_train_res, y_train_res = os_us.fit_sample(X_train, y_train) print ("Distribution of class labels before resampling {}".format(Counter(y_train))) print ("Distribution of class labels after resampling {}".format(Counter(y_train_res))) # In[25]: model = run_model(X_train_res, X_test, y_train_res, y_test) # In[26]: pred_y = model.predict(X_test) mostrar_resultados(y_test, pred_y) # # 5 Ensemble balanceado # In[27]: #Create an object of the classifier. bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(), sampling_strategy='auto', replacement=False, random_state=0) #Train the classifier. bbc.fit(X_train, y_train) # In[28]: pred_y = bbc.predict(X_test) mostrar_resultados(y_test, pred_y) # # Resultados # In[30]: df = pd.DataFrame({'algorithm' : ['Regresion Logística', 'Penalizacion', 'NearMiss Subsampling', 'Random Oversampling', 'Smote Tomek', 'Ensemble'], 'precision' : [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 'recall' : [0.66, 0.93, 0.93, 0.89, 0.85, 0.88]}) df['overall'] = df.apply(lambda row: (row.precision + row.recall)/2, axis=1) df = df.sort_values(['overall'], ascending=False) df # In[ ]: