! pip install -U scikit-learn
! pip install -U xgboost
! pip install -U lightgbm
! pip install catboost
! pip install -U imbalanced-learn
! pip install -U yellowbrick

import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from xgboost import cv as XGB_CV
from xgboost import DMatrix

# plotting
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import clear_output

# cluster analysis
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier

# processing pipeline
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

sns.set_theme()

bank = pd.read_parquet('santander_train.parquet')
data = bank.copy()

bank.shape
bank.sample(10, random_state=1)
bank.info()

# null check
bank.isna().sum().sum()

# duplicate row check
bank.duplicated().sum()

bank['TARGET'].value_counts(normalize=True)

# find constant columns
const_col = []
for col in bank.columns:
    if bank[col].std() == 0:
        const_col.append(col)

# find duplicate columns
dup_bool = bank.T.duplicated()
dups = []
for idx in dup_bool.index:
    if dup_bool[idx]:
        dups.append(idx)

remove = const_col + dups
print(f'There are {len(remove)} columns to remove.')

X = bank.drop(['ID', 'TARGET'] + remove, axis=1)
y = bank['TARGET']

# split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=57
)

scaler = StandardScaler().set_output(transform='pandas')
X_ts = scaler.fit_transform(X_train)
X_ts.describe().T.head()

print(f'Number of features: {X_ts.shape[1]}.')

# major reduction test
pca37 = PCA(n_components=37)
pca37.fit(X_ts)

plt.title('Cumulative variance explained by eigenvectors', fontsize=15)
plt.step(
    np.arange(1, 38),
    np.cumsum(pca37.explained_variance_ratio_),
    where='mid'
)
plt.xlabel('Number of Eigenvectors')
plt.ylabel('Cumulative Variance');

# about 1/3 the size
pca123 = PCA(n_components=123)
pca123.fit(X_ts)
evr = pca123.explained_variance_ratio_

plt.title(f'Cumulative explained variance (reaches {np.round(sum(evr) * 100, 2)}%)', fontsize=15)
plt.step(
    np.arange(1, 124),
    np.cumsum(evr),
    where='mid'
)
plt.xlabel('Number of Eigenvectors')
plt.ylabel('Cumulative Variance');

# preprocessing pipe
pre = Pipeline(
    steps=[
        ('Scaler', StandardScaler()),
        ('Dimension_Reduction', PCA(n_components=123))
    ]
).set_output(transform='pandas')

X_ts = pre.fit_transform(X_train)
X_vs = pre.transform(X_val)

a = X_ts.memory_usage().sum() / X_train.memory_usage().sum()
print(f'Memory usage reduced to {np.round(a * 100, 2)}% of original data frame.')

y_train.value_counts(normalize=True)

smote = SMOTE(
    sampling_strategy='not majority',
    random_state=1,
    k_neighbors=5
)

# oversampled training data
Xt_over, yt_over = smote.fit_resample(X_ts, y_train)

# re-scale data (note: the validation set X_vs keeps the original pipeline scaling)
Xt_over = scaler.fit_transform(Xt_over)

yt_over.value_counts(normalize=True)
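# Optional sketch (not part of the original flow): applying SMOTE to the full
# training set before cross-validation lets synthetic points leak into CV
# folds. imblearn's Pipeline resamples inside each training fold only, so CV
# scores stay honest. `imb_pipe` is a hypothetical name introduced here; the
# other objects are defined above.
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import cross_val_score

imb_pipe = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=1, k_neighbors=5)),  # fit on each training fold only
    ('model', LogisticRegression(max_iter=1000))      # any downstream classifier works
])
# usage: CV on the PCA-reduced (non-oversampled) training data from above
# cross_val_score(imb_pipe, X_ts, y_train, scoring='roc_auc', cv=5)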
def get_scores(model, sample=None, output=None):
    '''Collect model scores.'''
    # define training data
    if sample == 'over':
        X_t = Xt_over
        y_t = yt_over
    else:
        X_t = X_ts
        y_t = y_train

    # predictions
    y_t_hat = model.predict(X_t)
    y_v_hat = model.predict(X_vs)

    # collect scores
    train_scores = [
        metrics.recall_score(y_t, y_t_hat),
        metrics.fbeta_score(y_t, y_t_hat, beta=2),
        metrics.f1_score(y_t, y_t_hat),
        metrics.roc_auc_score(y_t, y_t_hat),
        metrics.zero_one_loss(y_t, y_t_hat)
    ]
    val_scores = [
        metrics.recall_score(y_val, y_v_hat),
        metrics.fbeta_score(y_val, y_v_hat, beta=2),
        metrics.f1_score(y_val, y_v_hat),
        metrics.roc_auc_score(y_val, y_v_hat),
        metrics.zero_one_loss(y_val, y_v_hat)
    ]

    # output scores in pandas df
    if output == 'pandas':
        df = pd.DataFrame(
            [train_scores, val_scores],
            columns=['Recall', 'F_beta', 'F1', 'AUC', '0-1_Loss'],
            index=['train', 'val']
        )
        return df
    return [train_scores, val_scores]


def confusion_heatmap(model, show_scores=True):
    '''Heatmap of confusion matrix for model performance on validation data.'''
    actual = y_val
    predicted = model.predict(X_vs)

    # generate confusion matrix
    cm = metrics.confusion_matrix(actual, predicted)
    cm = np.flip(cm).T  # reorder so the flattened cells read TP, FP, FN, TN

    # heatmap labels
    labels = ['TP', 'FP', 'FN', 'TN']
    cm_labels = np.array(cm).flatten()
    cm_percents = np.round((cm_labels / np.sum(cm)) * 100, 3)
    annot_labels = []
    for i in range(4):
        annot_labels.append(f'{labels[i]}\nCount:{cm_labels[i]}\n{cm_percents[i]}%')
    annot_labels = np.array(annot_labels).reshape(2, 2)

    # print figure
    plt.figure(figsize=(8, 5))
    plt.title('Confusion Matrix (Validation Data)', fontsize=20)
    sns.heatmap(
        data=cm,
        annot=annot_labels,
        annot_kws={'fontsize': 'x-large'},
        xticklabels=[1, 0],
        yticklabels=[1, 0],
        cmap='Greens',
        fmt='s'
    )
    plt.xlabel('Actual', fontsize=14)
    plt.ylabel('Predicted', fontsize=14)
    plt.tight_layout();

    # scores
    if show_scores:
        scores = ['Accuracy', 'Precision', 'Recall', 'F1']
        score_list = [
            metrics.accuracy_score(actual, predicted),
            metrics.precision_score(actual, predicted),
            metrics.recall_score(actual, predicted),
            metrics.f1_score(actual, predicted)
        ]
        df = pd.DataFrame(index=scores)
        df['Val. Scores'] = score_list
        return df
    return
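# Side note (an optional helper, not used by the tables below): the AUC in
# get_scores is computed from hard 0/1 predictions, which understates ranking
# quality; AUC is usually computed from predicted probabilities instead.
# `auc_from_proba` is a hypothetical name introduced here.
def auc_from_proba(model):
    '''AUC on the validation set from predicted probabilities rather than hard labels.'''
    proba = model.predict_proba(X_vs)[:, 1]
    return metrics.roc_auc_score(y_val, proba)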
# alias function name to something shorter
ch = confusion_heatmap

models = [
    'RandomForest',
    'AdaBoost',
    'XGBoost',
    'LightGBM',
    'CatBoost'
]
datasets = ['train', 'val']

# generate MultiIndex object
mi = pd.MultiIndex.from_product(
    iterables=[models, datasets],
    names=['model', 'data']
)

# build comparison table
tab = pd.DataFrame(
    columns=['Recall', 'F_beta', 'F1', 'AUC', '0-1_Loss'],
    index=mi
)

d = DummyClassifier(
    strategy='stratified',
    random_state=1
)
d.fit(X_ts, y_train)
ch(d)

rf = RandomForestClassifier(
    random_state=1,
    n_jobs=-1
)
rf.fit(X_ts, y_train)
tab.loc['RandomForest'] = get_scores(rf)
tab.loc['RandomForest']
ch(rf)

abc = AdaBoostClassifier(random_state=1)
abc.fit(X_ts, y_train)
tab.loc['AdaBoost'] = get_scores(abc)
tab.loc['AdaBoost']
ch(abc)

xgb = XGBClassifier(random_state=1)
xgb.fit(X_ts, y_train)
tab.loc['XGBoost'] = get_scores(xgb)
tab.loc['XGBoost']
ch(xgb)

lg = LGBMClassifier()
lg.fit(X_ts, y_train)
tab.loc['LightGBM'] = get_scores(lg)
tab.loc['LightGBM']
ch(lg)

cb = CatBoostClassifier()
cb.fit(X_ts, y_train, verbose=False)
tab.loc['CatBoost'] = get_scores(cb)
tab.loc['CatBoost']
ch(cb)

tab

# build comparison table for oversampled training data
tab_over = pd.DataFrame(
    columns=['Recall', 'F_beta', 'F1', 'AUC', '0-1_Loss'],
    index=mi
)

rf_over = RandomForestClassifier(
    random_state=1,
    n_jobs=-1
)
rf_over.fit(Xt_over, yt_over)
tab_over.loc['RandomForest'] = get_scores(rf_over, sample='over')
tab_over.loc['RandomForest']
ch(rf_over)

abc_over = AdaBoostClassifier(random_state=1)
abc_over.fit(Xt_over, yt_over)
tab_over.loc['AdaBoost'] = get_scores(abc_over, sample='over')
tab_over.loc['AdaBoost']
ch(abc_over)

xgb_over = XGBClassifier(
    tree_method='hist',
    device='cuda',  # XGBoost >= 2.0 replaces the deprecated tree_method='gpu_hist'
    random_state=1
)
xgb_over.fit(Xt_over, yt_over)
tab_over.loc['XGBoost'] = get_scores(xgb_over, sample='over')
tab_over.loc['XGBoost']
ch(xgb_over)

lg_over = LGBMClassifier(
    n_jobs=-1,
    random_state=1
)
lg_over.fit(Xt_over, yt_over)
tab_over.loc['LightGBM'] = get_scores(lg_over, sample='over')
tab_over.loc['LightGBM']
ch(lg_over)

cb_over = CatBoostClassifier(
    task_type='GPU',
    gpu_ram_part=0.9,
    gpu_cat_features_storage='GpuRam',
    random_seed=1
)
cb_over.fit(Xt_over, yt_over, verbose=False)
tab_over.loc['CatBoost'] = get_scores(cb_over, sample='over')
tab_over.loc['CatBoost']
ch(cb_over)

tab_over

params = {
    'n_estimators': np.arange(50, 251, 50),
    'learning_rate': [0.5, 1.0, 2.0]
}

abc_tuned1 = AdaBoostClassifier(random_state=1)

go1 = GridSearchCV(
    estimator=abc_tuned1,
    param_grid=params,
    scoring=['recall', 'f1', 'roc_auc'],
    refit='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

start = time.time()
go1.fit(Xt_over, yt_over)
print(f'Fit completed in {np.round((time.time() - start) / 60, 2)} minutes.')

best_abc1 = go1.best_params_
best_abc1

abc_tuned1 = AdaBoostClassifier(
    random_state=1,
    **best_abc1
)
abc_tuned1.fit(Xt_over, yt_over)
get_scores(abc_tuned1, sample='over', output='pandas')
ch(abc_tuned1)

lr = LogisticRegression(
    random_state=1,
    max_iter=1000
)
lr.fit(Xt_over, yt_over)
get_scores(lr, sample='over', output='pandas')

# logistic regression on non-oversampled data
logit = LogisticRegression(max_iter=1000)
logit.fit(X_ts, y_train)
get_scores(logit, output='pandas')

ch(lr)

abc_tuned3 = AdaBoostClassifier(
    estimator=LogisticRegression(
        random_state=2,
        max_iter=1000
    ),
    random_state=1
)
abc_tuned3.fit(Xt_over, yt_over)
get_scores(abc_tuned3, sample='over', output='pandas')
ch(abc_tuned3)
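# Optional sketch (not part of the original comparison): scikit-learn's
# LogisticRegression can reweight classes directly instead of relying on
# SMOTE, which gives a cheap baseline to contrast with the oversampled fits
# above. `lr_weighted` is a hypothetical name introduced here.
lr_weighted = LogisticRegression(
    class_weight='balanced',  # weights classes inversely to their frequency
    max_iter=1000,
    random_state=1
)
lr_weighted.fit(X_ts, y_train)            # trained on the non-oversampled data
get_scores(lr_weighted, output='pandas')  # same scoring helper as above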
go3 = GridSearchCV(
    estimator=abc_tuned3,
    param_grid=params,
    scoring='precision',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)
go3.fit(Xt_over, yt_over)

best_abc3 = go3.best_params_
best_abc3

abc_tuned3 = AdaBoostClassifier(
    estimator=LogisticRegression(
        random_state=2,
        max_iter=1000
    ),
    random_state=1,
    **best_abc3
)
abc_tuned3.fit(Xt_over, yt_over)
get_scores(abc_tuned3, sample='over', output='pandas')

params = {
    'eta': np.linspace(0.05, 0.3, 6),
    'max_depth': np.arange(2, 5),
    'min_child_weight': [1, 2],
    'subsample': np.linspace(0.5, 0.9, 4),
    'colsample_bytree': np.linspace(0.5, 0.9, 4)
}

xgb_tuned = XGBClassifier(
    tree_method='hist',
    device='cuda',
    random_state=1
)

go = GridSearchCV(
    estimator=xgb_tuned,
    param_grid=params,
    scoring=['recall', 'f1', 'roc_auc'],
    refit='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

start = time.time()
go.fit(Xt_over, yt_over)
print(f'Fit completed in {np.round((time.time() - start) / 60, 2)} minutes.')

best_xgb1 = go.best_params_
best_xgb1

xgb_tuned1 = XGBClassifier(
    tree_method='hist',
    device='cuda',
    random_state=1,
    **best_xgb1
)
xgb_tuned1.fit(Xt_over, yt_over)
ch(xgb_tuned1)
get_scores(xgb_tuned1, sample='over', output='pandas')

results = go.cv_results_

# figure setup
plt.figure(figsize=(12, 8))
plt.title('Mean Recall, ROC-AUC, and F1 for max_depth', fontsize=20)

# mean CV score per max_depth value, for each metric
for metric, label, style in [
    ('mean_train_recall', 'Train Recall', '--'),
    ('mean_train_roc_auc', 'Train AUC', '--'),
    ('mean_train_f1', 'Train F1', '--'),
    ('mean_test_recall', 'Val. Recall', '-'),
    ('mean_test_roc_auc', 'Val. AUC', '-'),
    ('mean_test_f1', 'Val. F1', '-')
]:
    a = (
        pd.DataFrame({
            'depth': results['param_max_depth'],
            'score': results[metric]
        })
        .groupby('depth')['score'].mean().values
    )
    plt.plot(
        np.arange(2, 5), a,
        label=label, linestyle=style, drawstyle='steps-mid'
    )

# axes and legend
plt.xlabel('Maximum Tree Depth')
plt.ylabel('Mean Score')
plt.legend(loc='lower right')
plt.show()

plt.figure(figsize=(12, 8))
plt.title('Recall, ROC-AUC, and F1 for eta', fontsize=20)

# mean CV score as a function of eta, for each metric
for metric, label, style in [
    ('mean_train_recall', 'Train Recall', '--'),
    ('mean_train_roc_auc', 'Train AUC', '--'),
    ('mean_train_f1', 'Train F1', '--'),
    ('mean_test_recall', 'Val. Recall', '-'),
    ('mean_test_roc_auc', 'Val. AUC', '-'),
    ('mean_test_f1', 'Val. F1', '-')
]:
    sns.lineplot(
        x=results['param_eta'],
        y=results[metric],
        errorbar=('ci', 0),
        label=label,
        linestyle=style
    )

plt.xlabel('eta (learning rate)')
plt.ylabel('Mean Score')
plt.legend(loc='lower right')
plt.show()
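# Optional sketch (mirrors the gamma-sweep setup below): xgboost.cv also
# supports early stopping, which picks the boosting-round count from the CV
# curve instead of fixing num_boost_round by eye. `dmat_es` and `cv_es` are
# hypothetical names introduced here.
dmat_es = DMatrix(Xt_over, yt_over, enable_categorical=True)
cv_es = XGB_CV(
    params=xgb_tuned1.get_params(),  # same reuse of sklearn params as the sweep below
    dtrain=dmat_es,
    num_boost_round=500,
    nfold=5,
    metrics={'auc'},
    early_stopping_rounds=20  # stop once held-out AUC stops improving
)
print(f'Best round count by CV: {len(cv_es)}.')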
plt.figure(figsize=(15, 15))

for i in range(9):
    # collect params and set gamma
    w = xgb_tuned1.get_params()
    w['gamma'] = 2 * i

    # convert to XGB DMatrix format
    dmat = DMatrix(
        Xt_over,
        yt_over,
        enable_categorical=True
    )

    # cv
    a = XGB_CV(
        params=w,
        dtrain=dmat,
        num_boost_round=250,
        nfold=5,
        metrics={'auc'}
    )

    # subplot
    plt.subplot(3, 3, i + 1)
    plt.title(f'gamma={2 * i}')
    plt.plot(np.arange(250), a['train-auc-mean'], label='train')
    plt.plot(np.arange(250), a['test-auc-mean'], label='test')
    plt.legend(loc='lower right')

clear_output()
plt.show()

w = xgb_tuned1.get_params()
w['gamma'] = 8
xgb_tuned2 = XGBClassifier(**w)
xgb_tuned2.fit(Xt_over, yt_over)
ch(xgb_tuned2)

# gamma-tuned model
get_scores(xgb_tuned2, sample='over', output='pandas')

# tuned model with default gamma
get_scores(xgb_tuned1, sample='over', output='pandas')

params = {
    'max_bin': [100, 150, 200],
    'min_gain_to_split': [0.001, 0.005, 0.01],
    'feature_fraction': np.linspace(0.5, 0.9, 4),
    'max_depth': np.arange(3, 10, 2)
}

lg_tuned = LGBMClassifier(random_state=1)

go = GridSearchCV(
    estimator=lg_tuned,
    param_grid=params,
    scoring=['recall', 'f1', 'roc_auc'],
    refit='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

start = time.time()
go.fit(Xt_over, yt_over)
print(f'Fit completed in {np.round((time.time() - start) / 60, 2)} minutes.')

best_lg = go.best_params_
best_lg

lg_tuned = LGBMClassifier(
    random_state=1,
    **best_lg
)
lg_tuned.fit(Xt_over, yt_over)
ch(lg_tuned)
get_scores(lg_tuned, sample='over', output='pandas')

results = go.cv_results_

# figure setup
plt.figure(figsize=(12, 8))
plt.title('Mean Recall, ROC-AUC, and F1 for max_depth', fontsize=20)

# mean CV score per max_depth value, for each metric
for metric, label, style in [
    ('mean_train_recall', 'Train Recall', '--'),
    ('mean_train_roc_auc', 'Train AUC', '--'),
    ('mean_train_f1', 'Train F1', '--'),
    ('mean_test_recall', 'Val. Recall', '-'),
    ('mean_test_roc_auc', 'Val. AUC', '-'),
    ('mean_test_f1', 'Val. F1', '-')
]:
    a = (
        pd.DataFrame({
            'depth': results['param_max_depth'],
            'score': results[metric]
        })
        .groupby('depth')['score'].mean().values
    )
    plt.plot(
        np.arange(3, 10, 2), a,
        label=label, linestyle=style, drawstyle='steps-mid'
    )

# axes and legend
plt.xlabel('Maximum Tree Depth')
plt.ylabel('Mean Score')
plt.legend(loc='lower right')
plt.show()
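# Optional sketch (an alternative to the is_unbalance run below): LightGBM can
# also take an explicit positive-class weight; the usual starting point is the
# negative/positive count ratio. `spw` and `lg_spw` are hypothetical names
# introduced here.
spw = (y_train == 0).sum() / (y_train == 1).sum()
lg_spw = LGBMClassifier(
    random_state=1,
    scale_pos_weight=spw  # mutually exclusive with is_unbalance
)
lg_spw.fit(X_ts, y_train)
get_scores(lg_spw, output='pandas')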
lg1 = LGBMClassifier(
    random_state=1,
    is_unbalance=True
)
lg1.fit(X_ts, y_train)
get_scores(lg1, output='pandas')
ch(lg1)

params = {
    'max_bin': [100, 150, 200, 255],
    'feature_fraction': np.linspace(0.5, 0.9, 4)
}

lg1 = LGBMClassifier(
    random_state=1,
    is_unbalance=True
)

go = GridSearchCV(
    estimator=lg1,
    param_grid=params,
    scoring=['recall', 'f1', 'roc_auc'],
    refit='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True
)

start = time.time()
go.fit(X_ts, y_train)
print(f'Fit completed in {np.round((time.time() - start) / 60, 2)} minutes.')

best_lg1 = go.best_params_
best_lg1

lg1 = LGBMClassifier(
    random_state=1,
    is_unbalance=True,
    **best_lg1
)
lg1.fit(X_ts, y_train)
ch(lg1)
get_scores(lg1, output='pandas')

results = go.cv_results_

plt.figure(figsize=(12, 8))
plt.title('Recall and ROC-AUC for Feature Fraction', fontsize=20)

# mean CV recall and AUC as a function of feature_fraction
for metric, label, style in [
    ('mean_train_recall', 'Train Recall', '--'),
    ('mean_train_roc_auc', 'Train AUC', '--'),
    ('mean_test_recall', 'Val. Recall', '-'),
    ('mean_test_roc_auc', 'Val. AUC', '-')
]:
    sns.lineplot(
        x=results['param_feature_fraction'],
        y=results[metric],
        errorbar=('ci', 0),
        label=label,
        linestyle=style
    )

plt.xlabel('Feature Fraction')
plt.ylabel('Mean Score')
plt.legend(loc='best')
plt.show()

plt.figure(figsize=(12, 8))
plt.title('Recall and ROC-AUC for max_bin', fontsize=20)

# mean CV recall and AUC as a function of max_bin
for metric, label, style in [
    ('mean_train_recall', 'Train Recall', '--'),
    ('mean_train_roc_auc', 'Train AUC', '--'),
    ('mean_test_recall', 'Val. Recall', '-'),
    ('mean_test_roc_auc', 'Val. AUC', '-')
]:
    sns.lineplot(
        x=results['param_max_bin'],
        y=results[metric],
        errorbar=('ci', 0),
        label=label,
        linestyle=style,
        drawstyle='steps-mid'
    )

plt.xlabel('Max Bin')
plt.ylabel('Mean Score')
plt.legend(loc='best')
plt.show()

plt.figure(figsize=(12, 8))
plt.title('F1 for max_bin', fontsize=20)

# F1 train
sns.lineplot(
    x=results['param_max_bin'],
    y=results['mean_train_f1'],
    errorbar=('ci', 0),
    label='Train F1',
    linestyle='--',
    drawstyle='steps-post'
)

# F1 val
sns.lineplot(
    x=results['param_max_bin'],
    y=results['mean_test_f1'],
    errorbar=('ci', 0),
    label='Val. F1',
    drawstyle='steps-post'
)

plt.xlabel('Max Bin')
plt.ylabel('Mean Score')
plt.legend(loc='best')
plt.show()
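# Optional closing sketch (assumes the tuned models above are still in memory):
# VotingClassifier is imported at the top but never used; a soft-vote ensemble
# of the tuned learners is one natural next step. `vote` is a hypothetical
# name introduced here.
vote = VotingClassifier(
    estimators=[
        ('ada', abc_tuned1),
        ('xgb', xgb_tuned1),
        ('lgbm', lg_tuned)
    ],
    voting='soft',  # average predicted probabilities
    n_jobs=-1
)
vote.fit(Xt_over, yt_over)
get_scores(vote, sample='over', output='pandas')
ch(vote)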