import os, pickle
import pandas as pd
import numpy as np
#
import matplotlib.pyplot as plt
from IPython.display import display
#
from sklearn import metrics
from sklearn.model_selection import train_test_split
#
import xgboost as xgb
from xgboost import plot_importance
# Pandas display settings for notebook output: wide lines, up to 500 rows and
# 500 columns, 4 digits of float precision.
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
# Do not truncate cell contents. `None` is the supported "no limit" value from
# pandas 1.0 on, and pandas 2.x rejects the old `-1` sentinel. Releases before
# 1.0 only accept an int here, so fall back to `-1` when `None` is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
%%time
# Feature sets to join. The first file is the base table; every following
# file is left-joined onto it by SK_ID_CURR.
ls_feat_file = [
    'baseline.pkl.bz2',
    'baseline_extend.pkl.bz2',
]
# Directory holding the pickled feature tables (same path for every file).
feat_dir = "../04_feature_engineering/features"

# use the first feature set as the base of the join
feat_path = os.path.join(feat_dir, ls_feat_file[0])
pdf_combined = pd.read_pickle(feat_path, compression="bz2")

# join the remaining feature sets
for fname in ls_feat_file[1:]:
    feat_path = os.path.join(feat_dir, fname)
    pdf_feat = pd.read_pickle(feat_path, compression="bz2")
    print(fname, pdf_feat.shape)

    # add the file stem (text before the first ".") as a prefix to every
    # column except the join key
    tbl_prefix = fname.split(".")[0]
    rename_col = {
        cname: "{}_{}".format(tbl_prefix, cname)
        for cname in pdf_feat.columns
        if cname != "SK_ID_CURR"
    }
    pdf_feat = pdf_feat.rename(columns=rename_col)

    # left join: keep every row of the base table
    pdf_combined = pdf_combined.merge(pdf_feat, on="SK_ID_CURR", how="left")

print("rows, columns", pdf_combined.shape)
# every column except the key is treated as a feature
ls_features = [feat for feat in pdf_combined.columns if feat not in ["SK_ID_CURR"]]
display(pdf_combined.head())
baseline_extend.pkl.bz2 (356255, 77) rows, columns (356255, 154)
SK_ID_CURR | NAME_INCOME_TYPE_Working | is_REGION_RATING_CLIENT_W_CITY | is_REGION_RATING_CLIENT | is_CODE_GENDER | NAME_EDUCATION_TYPE_Higher_education | NAME_EDUCATION_TYPE_Secondary___secondary_special | is_REG_CITY_NOT_WORK_CITY | is_FLAG_DOCUMENT_3 | HOUSETYPE_MODE_block_of_flats | NAME_INCOME_TYPE_Pensioner | ORGANIZATION_TYPE_XNA | is_FLAG_EMP_PHONE | OCCUPATION_TYPE_Laborers | WALLSMATERIAL_MODE_Panel | is_LIVE_CITY_NOT_WORK_CITY | NAME_FAMILY_STATUS_Married | is_FLAG_WORK_PHONE | is_FLAG_PHONE | is_FLAG_OWN_CAR | ORGANIZATION_TYPE_Self_employed | ORGANIZATION_TYPE_Business_Entity_Type_3 | NAME_FAMILY_STATUS_Single___not_married | FONDKAPREMONT_MODE_reg_oper_account | is_NAME_CONTRACT_TYPE | NAME_HOUSING_TYPE_House___apartment | is_FLAG_DOCUMENT_6 | OCCUPATION_TYPE_Drivers | NAME_FAMILY_STATUS_Civil_marriage | NAME_HOUSING_TYPE_With_parents | NAME_INCOME_TYPE_State_servant | OCCUPATION_TYPE_Core_staff | OCCUPATION_TYPE_Sales_staff | NAME_INCOME_TYPE_Commercial_associate | WALLSMATERIAL_MODE_Stone,_brick | NAME_FAMILY_STATUS_Widow | OCCUPATION_TYPE_Managers | OCCUPATION_TYPE_Accountants | is_FLAG_OWN_REALTY | ORGANIZATION_TYPE_Construction | NAME_TYPE_SUITE_Unaccompanied | is_FLAG_DOCUMENT_8 | NAME_TYPE_SUITE_Family | OCCUPATION_TYPE_High_skill_tech_staff | ORGANIZATION_TYPE_School | NAME_HOUSING_TYPE_Rented_apartment | OCCUPATION_TYPE_Low_skill_Laborers | OCCUPATION_TYPE_Security_staff | FONDKAPREMONT_MODE_reg_oper_spec_account | ORGANIZATION_TYPE_Medicine | FONDKAPREMONT_MODE_org_spec_account | WALLSMATERIAL_MODE_Block | OCCUPATION_TYPE_Cooking_staff | is_REG_REGION_NOT_WORK_REGION | NAME_EDUCATION_TYPE_Lower_secondary | ORGANIZATION_TYPE_Government | ORGANIZATION_TYPE_Trade__type_7 | OCCUPATION_TYPE_Medicine_staff | ORGANIZATION_TYPE_Military | ORGANIZATION_TYPE_Industry__type_3 | ORGANIZATION_TYPE_Bank | ORGANIZATION_TYPE_Transport__type_3 | ORGANIZATION_TYPE_Police | ORGANIZATION_TYPE_Restaurant | ORGANIZATION_TYPE_Kindergarten | 
ORGANIZATION_TYPE_Security | ORGANIZATION_TYPE_Agriculture | OCCUPATION_TYPE_Cleaning_staff | WALLSMATERIAL_MODE_Wooden | ORGANIZATION_TYPE_Security_Ministries | ORGANIZATION_TYPE_Trade__type_3 | ORGANIZATION_TYPE_Business_Entity_Type_2 | ORGANIZATION_TYPE_Other | is_REG_REGION_NOT_LIVE_REGION | NAME_EDUCATION_TYPE_Incomplete_higher | WALLSMATERIAL_MODE_Monolithic | ORGANIZATION_TYPE_Transport__type_4 | OCCUPATION_TYPE_Waiters_barmen_staff | baseline_extend_AMT_INCOME_TOTAL | baseline_extend_AMT_CREDIT | baseline_extend_AMT_ANNUITY | baseline_extend_AMT_GOODS_PRICE | baseline_extend_REGION_POPULATION_RELATIVE | baseline_extend_DAYS_REGISTRATION | baseline_extend_OWN_CAR_AGE | baseline_extend_CNT_FAM_MEMBERS | baseline_extend_EXT_SOURCE_1 | baseline_extend_EXT_SOURCE_2 | baseline_extend_EXT_SOURCE_3 | baseline_extend_APARTMENTS_AVG | baseline_extend_BASEMENTAREA_AVG | baseline_extend_YEARS_BEGINEXPLUATATION_AVG | baseline_extend_YEARS_BUILD_AVG | baseline_extend_COMMONAREA_AVG | baseline_extend_ELEVATORS_AVG | baseline_extend_ENTRANCES_AVG | baseline_extend_FLOORSMAX_AVG | baseline_extend_FLOORSMIN_AVG | baseline_extend_LANDAREA_AVG | baseline_extend_LIVINGAPARTMENTS_AVG | baseline_extend_LIVINGAREA_AVG | baseline_extend_NONLIVINGAPARTMENTS_AVG | baseline_extend_NONLIVINGAREA_AVG | baseline_extend_APARTMENTS_MODE | baseline_extend_BASEMENTAREA_MODE | baseline_extend_YEARS_BEGINEXPLUATATION_MODE | baseline_extend_YEARS_BUILD_MODE | baseline_extend_COMMONAREA_MODE | baseline_extend_ELEVATORS_MODE | baseline_extend_ENTRANCES_MODE | baseline_extend_FLOORSMAX_MODE | baseline_extend_FLOORSMIN_MODE | baseline_extend_LANDAREA_MODE | baseline_extend_LIVINGAPARTMENTS_MODE | baseline_extend_LIVINGAREA_MODE | baseline_extend_NONLIVINGAPARTMENTS_MODE | baseline_extend_NONLIVINGAREA_MODE | baseline_extend_APARTMENTS_MEDI | baseline_extend_BASEMENTAREA_MEDI | baseline_extend_YEARS_BEGINEXPLUATATION_MEDI | baseline_extend_YEARS_BUILD_MEDI | baseline_extend_COMMONAREA_MEDI | 
baseline_extend_ELEVATORS_MEDI | baseline_extend_ENTRANCES_MEDI | baseline_extend_FLOORSMAX_MEDI | baseline_extend_FLOORSMIN_MEDI | baseline_extend_LANDAREA_MEDI | baseline_extend_LIVINGAPARTMENTS_MEDI | baseline_extend_LIVINGAREA_MEDI | baseline_extend_NONLIVINGAPARTMENTS_MEDI | baseline_extend_NONLIVINGAREA_MEDI | baseline_extend_TOTALAREA_MODE | baseline_extend_OBS_30_CNT_SOCIAL_CIRCLE | baseline_extend_DEF_30_CNT_SOCIAL_CIRCLE | baseline_extend_OBS_60_CNT_SOCIAL_CIRCLE | baseline_extend_DEF_60_CNT_SOCIAL_CIRCLE | baseline_extend_DAYS_LAST_PHONE_CHANGE | baseline_extend_AMT_REQ_CREDIT_BUREAU_HOUR | baseline_extend_AMT_REQ_CREDIT_BUREAU_DAY | baseline_extend_AMT_REQ_CREDIT_BUREAU_WEEK | baseline_extend_AMT_REQ_CREDIT_BUREAU_MON | baseline_extend_AMT_REQ_CREDIT_BUREAU_QRT | baseline_extend_AMT_REQ_CREDIT_BUREAU_YEAR | baseline_extend_CREDIT_INCOME_PERCENT | baseline_extend_ANNUITY_INCOME_PERCENT | baseline_extend_CREDIT_TERM | baseline_extend_YEARS_BIRTH | baseline_extend_REGISTRATION_YEAR | baseline_extend_ID_PUBLISH_YEAR | baseline_extend_LAST_PHONE_CHANGE_YEAR | baseline_extend_DAYS_EMPLOYED_ANOM | baseline_extend_DAYS_EMPLOYED | baseline_extend_YEARS_EMPLOYED | baseline_extend_YEARS_EMPLOYED_PERCENT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | 2 | 2 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 202500.0 | 406597.5 | 24700.5 | 351000.0 | 0.0188 | -3648.0 | 0.0 | 1.0 | 0.0830 | 0.2629 | 0.1394 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0369 | 0.0202 | 0.0190 | 0.0000 | 0.0000 | 0.0252 | 0.0383 | 0.9722 | 0.6341 | 0.0144 | 0.0000 | 0.0690 | 0.0833 | 0.1250 | 0.0377 | 0.022 | 0.0198 | 0.0 | 0.0 | 0.0250 | 0.0369 | 0.9722 | 0.6243 | 0.0144 | 0.00 | 0.0690 | 0.0833 | 0.1250 | 0.0375 | 0.0205 | 0.0193 | 0.0000 | 0.00 | 0.0149 | 2.0 | 2.0 | 2.0 | 2.0 | -1134.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.0079 | 0.1220 | 0.0607 | 25.9205 | 9.9945 | 5.8082 | 3.1068 | False | -637.0 | 1.7452 | 0.0673 |
1 | 100003 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 270000.0 | 1293502.5 | 35698.5 | 1129500.0 | 0.0035 | -1186.0 | 0.0 | 2.0 | 0.3113 | 0.6222 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0130 | 0.0773 | 0.0549 | 0.0039 | 0.0098 | 0.0924 | 0.0538 | 0.9851 | 0.8040 | 0.0497 | 0.0806 | 0.0345 | 0.2917 | 0.3333 | 0.0128 | 0.079 | 0.0554 | 0.0 | 0.0 | 0.0968 | 0.0529 | 0.9851 | 0.7987 | 0.0608 | 0.08 | 0.0345 | 0.2917 | 0.3333 | 0.0132 | 0.0787 | 0.0558 | 0.0039 | 0.01 | 0.0714 | 1.0 | 0.0 | 1.0 | 0.0 | -828.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.7908 | 0.1322 | 0.0276 | 45.9315 | 3.2493 | 0.7973 | 2.2685 | False | -1188.0 | 3.2548 | 0.0709 |
2 | 100004 | 1 | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 67500.0 | 135000.0 | 6750.0 | 135000.0 | 0.0100 | -4260.0 | 26.0 | 1.0 | NaN | 0.5559 | 0.7296 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -815.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0000 | 0.1000 | 0.0500 | 52.1808 | 11.6712 | 6.9342 | 2.2329 | False | -225.0 | 0.6164 | 0.0118 |
3 | 100006 | 1 | 2 | 2 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 135000.0 | 312682.5 | 29686.5 | 297000.0 | 0.0080 | -9833.0 | 0.0 | 2.0 | NaN | 0.6504 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 0.0 | 2.0 | 0.0 | -617.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2.3162 | 0.2199 | 0.0949 | 52.0685 | 26.9397 | 6.6767 | 1.6904 | False | -3039.0 | 8.3260 | 0.1599 |
4 | 100007 | 1 | 2 | 2 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 121500.0 | 513000.0 | 21865.5 | 513000.0 | 0.0287 | -4311.0 | 0.0 | 1.0 | NaN | 0.3227 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1106.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.2222 | 0.1800 | 0.0426 | 54.6082 | 11.8110 | 9.4740 | 3.0301 | False | -3038.0 | 8.3233 | 0.1524 |
CPU times: user 31.4 s, sys: 2.02 s, total: 33.4 s Wall time: 33.5 s
# join with label
pdf_tvt = pd.read_pickle("../04_feature_engineering/pdf_tvt_extend.pkl", compression="bz2")
pdf_features_label = pdf_tvt.merge(pdf_combined, on="SK_ID_CURR", how="left")
print(pdf_features_label.shape)

# Manual switch: when enabled, the frame merged above is discarded and
# replaced by the table stored in pdf_features_label.csv.bz2.
use_saved_features = True
if use_saved_features:
    pdf_features_label = pd.read_csv(
        os.path.join("../04_feature_engineering/features", "pdf_features_label.csv.bz2"),
        compression="bz2",
    )

# Columns that are not model inputs; everything else counts as a feature.
meta_cols = ["SK_ID_CURR", "TARGET", "tvt_code"]
ls_features = [cname for cname in pdf_features_label.columns if cname not in meta_cols]
#
print("Number of features: {}".format(len(ls_features)))
print(pdf_features_label.shape)
Number of features: 1042 (356255, 1045)
version = "v07"

# Read the pickled result dictionary of a previous run for this version.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load model files that were produced by this project.
model_path = "models/xgb_model_{}.mod".format(version)
with open(model_path, "rb") as input_file:
    res_model = pickle.load(input_file)

# show which entries the loaded dictionary provides
res_model.keys()
/home/ubuntu/anaconda3/lib/python3.6/site-packages/sklearn/base.py:251: UserWarning: Trying to unpickle estimator LabelEncoder from version 0.19.1 when using version 0.20.0. This might lead to breaking code or invalid results. Use at your own risk. UserWarning)
odict_keys(['auc', 'ls_tracked_auc', 'ls_curr_features', 'imp', 'ls_tracked_imp', 'model', 'features'])
# Restrict the table to the meta columns plus the features listed in the
# loaded model's "imp" entry (a sequence of (feature, value) pairs).
meta_cols = ["SK_ID_CURR", "TARGET", "tvt_code"]
ls_features = [name for name, _importance in res_model["imp"]]

selected_cols = meta_cols + ls_features
pdf_features_label = pdf_features_label[selected_cols]

print("Selected features: {}".format(len(ls_features)))
Selected features: 905
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# Keep only the rows whose tvt_code is train, val or test; these rows are
# what the grid search below is fitted on. Copy so later changes do not
# touch pdf_features_label.
cv_splits = ["train", "val", "test"]
in_cv_split = pdf_features_label["tvt_code"].isin(cv_splits)
pdf_data = pdf_features_label[in_cv_split].copy()
pdf_data.shape
(307511, 908)
%%time
# Hyper-parameter grid. Four parameters have two candidate values each
# (max_depth, subsample, colsample_bytree, colsample_bylevel), giving
# 2 * 2 * 2 * 2 = 16 combinations; every other entry is a single fixed value.
param_grid = dict(
    # fixed settings
    objective=["binary:logistic"],
    booster=["gbtree"],
    n_estimators=[1000],
    learning_rate=[0.025],
    min_child_weight=[11],
    random_state=[1],
    seed=[1],
    silent=[True],
    # searched settings
    max_depth=[4, 7],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8],
    colsample_bylevel=[0.6, 0.8],
)

xgb_model = xgb.XGBClassifier()

# 5-fold stratified CV with a fixed shuffle seed, scored by ROC AUC and run
# on 16 parallel workers.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='roc_auc',
    cv=cv_folds,
    n_jobs=16,
    verbose=2,
)

# fit() returns the fitted search object itself
grid_result = grid_search.fit(pdf_data[ls_features], pdf_data["TARGET"])
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
# Report the best candidate, then the mean (std) test score of every
# candidate in the grid.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
#
# Disabled plotting code, kept for reference.
# NOTE(review): as written it pairs the 2 grid values of each parameter with
# the scores of all candidates, so the lengths look mismatched -- rework
# before re-enabling.
# for check_param in ["max_depth", "subsample", "colsample_bytree", "colsample_bylevel"]:
#     plt.errorbar(param_grid[check_param], means, yerr=stds)
#     plt.title("XGBoost {} vs AUC".format(check_param))
#     plt.xlabel(check_param)
#     plt.ylabel('AUC')
#     plt.show()
Best: 0.790018 using {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8} 0.787458 (0.001713) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6} 0.787668 (0.001883) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8} 0.789044 (0.002316) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6} 0.790018 (0.001945) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8} 0.787579 (0.001834) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6} 0.787813 (0.001535) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8} 0.789147 (0.002190) with: {'booster': 
'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6} 0.789775 (0.002219) with: {'booster': 'gbtree', 'colsample_bylevel': 0.6, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8} 0.787652 (0.001759) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6} 0.787831 (0.001788) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8} 0.788921 (0.002179) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6} 0.789692 (0.001530) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.6, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8} 0.787874 (0.001682) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6} 0.787952 (0.001679) with: {'booster': 'gbtree', 'colsample_bylevel': 
0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8} 0.789661 (0.002410) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.6} 0.789685 (0.002188) with: {'booster': 'gbtree', 'colsample_bylevel': 0.8, 'colsample_bytree': 0.8, 'learning_rate': 0.025, 'max_depth': 7, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'random_state': 1, 'seed': 1, 'silent': True, 'subsample': 0.8}
# Score the rows flagged as kaggle_test. predict_proba on the search object
# is served by the estimator refit with the best parameters (GridSearchCV's
# default refit behaviour); column 1 is the probability of class 1.
is_kaggle_test = pdf_features_label["tvt_code"] == "kaggle_test"
X_kaggle_test = pdf_features_label[is_kaggle_test][ls_features]

class_proba = grid_search.predict_proba(X_kaggle_test)
y_test_pred = class_proba[:, 1]
y_test_pred.mean()
0.06975483
# Build the submission table: one row per kaggle_test id, in the same row
# order as the predictions, and write it without the index column.
SK_IDs = pdf_features_label.loc[
    pdf_features_label["tvt_code"] == "kaggle_test", "SK_ID_CURR"
].tolist()

pdf_submiss = pd.DataFrame({"SK_ID_CURR": SK_IDs, "TARGET": y_test_pred})

submission_path = "submissions/submission_gridsearch_{}.csv".format(version)
pdf_submiss.to_csv(submission_path, index=False)
pdf_submiss.head()
SK_ID_CURR | TARGET | |
---|---|---|
0 | 100001 | 0.0267 |
1 | 100005 | 0.1182 |
2 | 100013 | 0.0361 |
3 | 100028 | 0.0458 |
4 | 100038 | 0.1852 |
# Save the fitted grid search to file.
# Both keys are stored as in the original layout; "grid_result" is the value
# returned by grid_search.fit(...).
res_model = {
    "grid_search": grid_search,
    "grid_result": grid_result,
}
# Use a context manager so the file is flushed and closed even if pickling
# fails part-way; a bare open() handed to pickle.dump is never closed.
with open("models/xgb_gridsearch_{}.mod".format(version), "wb") as output_file:
    pickle.dump(res_model, output_file)