In [1]:

import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display

from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

import xgboost as xgb
from xgboost import plot_importance

# Pandas display settings: wide console, up to 500 rows/columns, 4 decimals.
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)

# Never truncate cell contents. pandas >= 1.0 spells "unlimited" as None and
# pandas 2.x rejects the legacy -1; older releases only accept an int, so
# fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

Load combined features with label (option 1)

In [2]:

%%time
# Feature tables to combine. The first entry is the base table; every later
# entry is left-joined onto it by SK_ID_CURR.
ls_feat_file = [
    'baseline.pkl.bz2',
    'baseline_extend.pkl.bz2',
]

# base table: its columns keep their original names
pdf_combined = pd.read_pickle(
    os.path.join("../04_feature_engineering/features", ls_feat_file[0]),
    compression="bz2",
)

# remaining tables: prefix, then join
for fname in ls_feat_file[1:]:
    pdf_feat = pd.read_pickle(
        os.path.join("../04_feature_engineering/features", fname),
        compression="bz2",
    )
    print(fname, pdf_feat.shape)

    # prefix every column except the join key with the file name up to its
    # first dot, so columns coming from different tables cannot collide
    tbl_prefix = fname.split(".")[0]
    rename_col = {}
    for cname in pdf_feat.columns:
        if cname != "SK_ID_CURR":
            rename_col[cname] = "{}_{}".format(tbl_prefix, cname)
    pdf_feat = pdf_feat.rename(columns=rename_col)

    # left join keeps every row of the table built so far
    pdf_combined = pd.merge(pdf_combined, pdf_feat, on="SK_ID_CURR", how="left")

print("rows, columns", pdf_combined.shape)

# everything except the key column is treated as a feature
ls_features = [feat for feat in pdf_combined.columns if feat != "SK_ID_CURR"]
display(pdf_combined.head())

baseline_extend.pkl.bz2 (356255, 77)
rows, columns (356255, 154)

	SK_ID_CURR	NAME_INCOME_TYPE_Working	is_REGION_RATING_CLIENT_W_CITY	is_REGION_RATING_CLIENT	is_CODE_GENDER	NAME_EDUCATION_TYPE_Higher_education	NAME_EDUCATION_TYPE_Secondary___secondary_special	is_REG_CITY_NOT_WORK_CITY	is_FLAG_DOCUMENT_3	HOUSETYPE_MODE_block_of_flats	is_FLAG_EMP_PHONE	OCCUPATION_TYPE_Laborers	is_LIVE_CITY_NOT_WORK_CITY	NAME_FAMILY_STATUS_Married	is_FLAG_WORK_PHONE	is_FLAG_PHONE	is_FLAG_OWN_CAR	ORGANIZATION_TYPE_Business_Entity_Type_3	NAME_FAMILY_STATUS_Single___not_married	FONDKAPREMONT_MODE_reg_oper_account	is_NAME_CONTRACT_TYPE	NAME_HOUSING_TYPE_House___apartment	NAME_FAMILY_STATUS_Civil_marriage	NAME_INCOME_TYPE_State_servant	OCCUPATION_TYPE_Core_staff	WALLSMATERIAL_MODE_Stone,_brick	is_FLAG_OWN_REALTY	NAME_TYPE_SUITE_Unaccompanied	is_FLAG_DOCUMENT_8	NAME_TYPE_SUITE_Family	ORGANIZATION_TYPE_School	WALLSMATERIAL_MODE_Block	ORGANIZATION_TYPE_Government	baseline_extend_AMT_INCOME_TOTAL	baseline_extend_AMT_CREDIT	baseline_extend_AMT_ANNUITY	baseline_extend_AMT_GOODS_PRICE	baseline_extend_REGION_POPULATION_RELATIVE	baseline_extend_DAYS_REGISTRATION	baseline_extend_OWN_CAR_AGE	baseline_extend_CNT_FAM_MEMBERS	baseline_extend_EXT_SOURCE_1	baseline_extend_EXT_SOURCE_2	baseline_extend_EXT_SOURCE_3	baseline_extend_APARTMENTS_AVG	baseline_extend_BASEMENTAREA_AVG	baseline_extend_YEARS_BEGINEXPLUATATION_AVG	baseline_extend_YEARS_BUILD_AVG	baseline_extend_COMMONAREA_AVG	baseline_extend_ELEVATORS_AVG	baseline_extend_ENTRANCES_AVG	baseline_extend_FLOORSMAX_AVG	baseline_extend_FLOORSMIN_AVG	baseline_extend_LANDAREA_AVG	baseline_extend_LIVINGAPARTMENTS_AVG	baseline_extend_LIVINGAREA_AVG	baseline_extend_NONLIVINGAPARTMENTS_AVG	baseline_extend_NONLIVINGAREA_AVG	baseline_extend_APARTMENTS_MODE	baseline_extend_BASEMENTAREA_MODE	baseline_extend_YEARS_BEGINEXPLUATATION_MODE	baseline_extend_YEARS_BUILD_MODE	baseline_extend_COMMONAREA_MODE	baseline_extend_ELEVATORS_MODE	baseline_extend_ENTRANCES_MODE	baseline_extend_FLOORSMAX_MODE	baseline_extend_FLOORSMIN_MODE	
baseline_extend_LANDAREA_MODE	baseline_extend_LIVINGAPARTMENTS_MODE	baseline_extend_LIVINGAREA_MODE	baseline_extend_NONLIVINGAPARTMENTS_MODE	baseline_extend_NONLIVINGAREA_MODE	baseline_extend_APARTMENTS_MEDI	baseline_extend_BASEMENTAREA_MEDI	baseline_extend_YEARS_BEGINEXPLUATATION_MEDI	baseline_extend_YEARS_BUILD_MEDI	baseline_extend_COMMONAREA_MEDI	baseline_extend_ELEVATORS_MEDI	baseline_extend_ENTRANCES_MEDI	baseline_extend_FLOORSMAX_MEDI	baseline_extend_FLOORSMIN_MEDI	baseline_extend_LANDAREA_MEDI	baseline_extend_LIVINGAPARTMENTS_MEDI	baseline_extend_LIVINGAREA_MEDI	baseline_extend_NONLIVINGAPARTMENTS_MEDI	baseline_extend_NONLIVINGAREA_MEDI	baseline_extend_TOTALAREA_MODE	baseline_extend_OBS_30_CNT_SOCIAL_CIRCLE	baseline_extend_DEF_30_CNT_SOCIAL_CIRCLE	baseline_extend_OBS_60_CNT_SOCIAL_CIRCLE	baseline_extend_DEF_60_CNT_SOCIAL_CIRCLE	baseline_extend_DAYS_LAST_PHONE_CHANGE	baseline_extend_AMT_REQ_CREDIT_BUREAU_HOUR	baseline_extend_AMT_REQ_CREDIT_BUREAU_DAY	baseline_extend_AMT_REQ_CREDIT_BUREAU_WEEK	baseline_extend_AMT_REQ_CREDIT_BUREAU_MON	baseline_extend_AMT_REQ_CREDIT_BUREAU_QRT	baseline_extend_AMT_REQ_CREDIT_BUREAU_YEAR	baseline_extend_CREDIT_INCOME_PERCENT	baseline_extend_ANNUITY_INCOME_PERCENT	baseline_extend_CREDIT_TERM	baseline_extend_YEARS_BIRTH	baseline_extend_REGISTRATION_YEAR	baseline_extend_ID_PUBLISH_YEAR	baseline_extend_LAST_PHONE_CHANGE_YEAR	baseline_extend_DAYS_EMPLOYED_ANOM	baseline_extend_DAYS_EMPLOYED	baseline_extend_YEARS_EMPLOYED	baseline_extend_YEARS_EMPLOYED_PERCENT
0	100002	1	2	2	1	0	1	0	1	1	1	1	0	0	0	1	0	1	1	1	1	1	0	0	0	1	1	1	0	0	0	0	0	202500.0	406597.5	24700.5	351000.0	0.0188	-3648.0	0.0	1.0	0.0830	0.2629	0.1394	0.0247	0.0369	0.9722	0.6192	0.0143	0.00	0.0690	0.0833	0.1250	0.0369	0.0202	0.0190	0.0000	0.0000	0.0252	0.0383	0.9722	0.6341	0.0144	0.0000	0.0690	0.0833	0.1250	0.0377	0.022	0.0198	0.0	0.0	0.0250	0.0369	0.9722	0.6243	0.0144	0.00	0.0690	0.0833	0.1250	0.0375	0.0205	0.0193	0.0000	0.00	0.0149	2.0	2.0	2.0	2.0	-1134.0	0.0	0.0	0.0	0.0	0.0	1.0	2.0079	0.1220	0.0607	25.9205	9.9945	5.8082	3.1068	False	-637.0	1.7452	0.0673
1	100003	0	1	1	0	1	0	0	1	1	1	0	0	1	0	1	0	0	0	1	1	1	0	1	1	0	0	0	0	1	1	1	0	270000.0	1293502.5	35698.5	1129500.0	0.0035	-1186.0	0.0	2.0	0.3113	0.6222	NaN	0.0959	0.0529	0.9851	0.7960	0.0605	0.08	0.0345	0.2917	0.3333	0.0130	0.0773	0.0549	0.0039	0.0098	0.0924	0.0538	0.9851	0.8040	0.0497	0.0806	0.0345	0.2917	0.3333	0.0128	0.079	0.0554	0.0	0.0	0.0968	0.0529	0.9851	0.7987	0.0608	0.08	0.0345	0.2917	0.3333	0.0132	0.0787	0.0558	0.0039	0.01	0.0714	1.0	0.0	1.0	0.0	-828.0	0.0	0.0	0.0	0.0	0.0	0.0	4.7908	0.1322	0.0276	45.9315	3.2493	0.7973	2.2685	False	-1188.0	3.2548	0.0709
2	100004	1	2	2	1	0	1	0	0	0	1	1	0	0	1	1	1	0	1	0	0	1	0	0	0	0	1	1	0	0	0	0	1	67500.0	135000.0	6750.0	135000.0	0.0100	-4260.0	26.0	1.0	NaN	0.5559	0.7296	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	0.0	0.0	0.0	-815.0	0.0	0.0	0.0	0.0	0.0	0.0	2.0000	0.1000	0.0500	52.1808	11.6712	6.9342	2.2329	False	-225.0	0.6164	0.0118
3	100006	1	2	2	0	0	1	0	1	0	1	1	0	0	0	0	0	1	0	0	1	1	1	0	0	0	1	1	0	0	0	0	0	135000.0	312682.5	29686.5	297000.0	0.0080	-9833.0	0.0	2.0	NaN	0.6504	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2.0	0.0	2.0	0.0	-617.0	NaN	NaN	NaN	NaN	NaN	NaN	2.3162	0.2199	0.0949	52.0685	26.9397	6.6767	1.6904	False	-3039.0	8.3260	0.1599
4	100007	1	2	2	1	0	1	1	0	0	1	0	1	0	0	0	0	0	1	0	1	1	0	0	1	0	1	1	1	0	0	0	0	121500.0	513000.0	21865.5	513000.0	0.0287	-4311.0	0.0	1.0	NaN	0.3227	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	0.0	0.0	0.0	-1106.0	0.0	0.0	0.0	0.0	0.0	0.0	4.2222	0.1800	0.0426	54.6082	11.8110	9.4740	3.0301	False	-3038.0	8.3233	0.1524

CPU times: user 31.4 s, sys: 2.02 s, total: 33.4 s
Wall time: 33.5 s

In [ ]:

# Attach the combined features to the label table: one output row per row of
# pdf_tvt, matched on SK_ID_CURR (left join).
pdf_tvt = pd.read_pickle(
    "../04_feature_engineering/pdf_tvt_extend.pkl",
    compression="bz2",
)
pdf_features_label = pd.merge(pdf_tvt, pdf_combined, on="SK_ID_CURR", how="left")
print(pdf_features_label.shape)

Load combined features with label (option 2)

In [2]:

if True:
    # Option 2: read the already-joined features + label table from CSV.
    features_label_path = os.path.join(
        "../04_feature_engineering/features", "pdf_features_label.csv.bz2"
    )
    pdf_features_label = pd.read_csv(features_label_path, compression="bz2")

    # every column that is not one of the meta columns counts as a feature
    meta_cols = ["SK_ID_CURR", "TARGET", "tvt_code"]
    all_cols = pdf_features_label.columns
    ls_features = all_cols[~all_cols.isin(meta_cols)].tolist()

    print("Number of features: {}".format(len(ls_features)))
    print(pdf_features_label.shape)

Number of features: 1042
(356255, 1045)

Grid search

In [3]:

# Experiment tag: formatted into the model file read below and into the
# submission / grid-search file names written at the end of the notebook.
version = "v07"

In [4]:

# Load the result dict of the previously trained model for this version.
# NOTE: unpickling runs arbitrary code, so only open files produced by this project.
model_path = "models/xgb_model_{}.mod".format(version)
with open(model_path, "rb") as fh:
    res_model = pickle.load(fh)

# show which entries the stored result contains
res_model.keys()

/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/base.py:251: UserWarning: Trying to unpickle estimator LabelEncoder from version 0.19.1 when using version 0.20.0. This might lead to breaking code or invalid results. Use at your own risk.
  UserWarning)

Out[4]:

odict_keys(['auc', 'ls_tracked_auc', 'ls_curr_features', 'imp', 'ls_tracked_imp', 'model', 'features'])

In [5]:

# Restrict the table to the meta columns plus the features listed in the
# loaded model's "imp" entry (iterated as (feature, value) pairs).
meta_cols = ["SK_ID_CURR", "TARGET", "tvt_code"]
ls_features = [feat for feat, _val in res_model["imp"]]

keep_cols = meta_cols + ls_features
pdf_features_label = pdf_features_label[keep_cols]
print("Selected features: {}".format(len(ls_features)))

Selected features: 905

In [6]:

# GridSearchCV / StratifiedKFold are imported with the other imports in the
# first cell, so a fresh kernel shows every dependency up front.

# Rows used for the grid search: those tagged train, val or test. The
# remaining kaggle_test rows are only scored in the Submission section.
# .copy() detaches the result from pdf_features_label.
tvt_codes = ["train", "val", "test"]
pdf_data = pdf_features_label[pdf_features_label["tvt_code"].isin(tvt_codes)].copy()
pdf_data.shape

Out[6]:

(307511, 908)

In [ ]:

%%time
# Hyper-parameter grid. Single-element lists are held fixed; the four
# two-element lists are searched, giving 2 * 2 * 2 * 2 = 16 candidates.
param_grid = {
    # searched
    "max_depth": [4, 7],
    "subsample": [0.6, 0.8],
    "colsample_bytree": [0.6, 0.8],
    "colsample_bylevel": [0.6, 0.8],

    # fixed
    "objective": ["binary:logistic"],
    "booster": ["gbtree"],
    "n_estimators": [1000],
    "learning_rate": [0.025],
    'min_child_weight': [11],
    "random_state": [1],
    'seed': [1],
    "silent": [True],
}

# 5-fold stratified CV with a fixed shuffle seed, so the folds are reproducible
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv_folds,
    n_jobs=16,
    verbose=2,
)

# fit on the selected feature columns against the TARGET label
X_search = pdf_data[ls_features]
y_search = pdf_data["TARGET"]
grid_result = grid_search.fit(X_search, y_search)

Fitting 5 folds for each of 16 candidates, totalling 80 fits

[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.

In [15]:

# best candidate first, then one line per candidate:
# mean CV score (ROC AUC), its standard deviation, and the parameter set
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for idx in range(min(len(means), len(stds), len(params))):
    print("%f (%f) with: %r" % (means[idx], stds[idx], params[idx]))

# NOTE: the plot below is left disabled. As written it would pair the two
# values in param_grid[check_param] with the per-candidate means/stds (one
# entry per candidate), so the x and y lengths do not match.
# for check_param in ["max_depth", "subsample", "colsample_bytree", "colsample_bylevel"]:
#     plt.errorbar(param_grid[check_param], means, yerr=stds) 
#     plt.title("XGBoost {} vs AUC".format(check_param)) 
#     plt.xlabel(check_param)
#     plt.ylabel('AUC') 
#     plt.show()

Best: 0.790018 using {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8}
0.787458 (0.001713) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6}
0.787668 (0.001883) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8}
0.789044 (0.002316) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6}
0.790018 (0.001945) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8}
0.787579 (0.001834) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6}
0.787813 (0.001535) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8}
0.789147 (0.002190) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6}
0.789775 (0.002219) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8}
0.787652 (0.001759) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6}
0.787831 (0.001788) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8}
0.788921 (0.002179) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6}
0.789692 (0.001530) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8}
0.787874 (0.001682) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6}
0.787952 (0.001679) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8}
0.789661 (0.002410) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6}
0.789685 (0.002188) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8}

Submission

In [10]:

# feature matrix for the rows tagged kaggle_test
pdf_kaggle_test = pdf_features_label.query("tvt_code == 'kaggle_test'")
X_kaggle_test = pdf_kaggle_test[ls_features]

# keep the second predict_proba column (presumably P(TARGET == 1) -- confirm
# against grid_search.classes_)
kaggle_proba = grid_search.predict_proba(X_kaggle_test)
y_test_pred = kaggle_proba[:, 1]

# sanity check: average predicted probability
y_test_pred.mean()

Out[10]:

0.06975483

In [11]:

# Build the submission table: one row per kaggle_test id with its predicted
# probability. The ids come from the same query, in the same row order, as
# the feature matrix the predictions were computed from.
SK_IDs = pdf_features_label.query("tvt_code == 'kaggle_test'")["SK_ID_CURR"].tolist()
pdf_submiss = pd.DataFrame({"SK_ID_CURR": SK_IDs, "TARGET": y_test_pred})

# to_csv does not create missing directories; make sure the target folder
# exists so the write cannot fail after the long grid search has finished.
submission_path = "submissions/submission_gridsearch_{}.csv".format(version)
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
pdf_submiss.to_csv(submission_path, index=False)
pdf_submiss.head()

Out[11]:

	SK_ID_CURR	TARGET
0	100001	0.0267
1	100005	0.1182
2	100013	0.0361
3	100028	0.0458
4	100038	0.1852

In [12]:

# Save the fitted grid search to file.
# NOTE: this rebinds res_model, which earlier held the dict loaded from
# models/xgb_model_<version>.mod; re-run that load cell before re-running any
# cell that reads res_model["imp"].
res_model = {
    "grid_search": grid_search,
    "grid_result": grid_result
}

# the with-block closes (and flushes) the file even if pickling fails
with open("models/xgb_gridsearch_{}.mod".format(version), "wb") as output_file:
    pickle.dump(res_model, output_file)