#!/usr/bin/env python
# coding: utf-8

# In[1]:

import os, pickle
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
from IPython.display import display

# from sklearn import metrics
from sklearn.model_selection import train_test_split

import xgboost as xgb  # needed below for xgb.XGBClassifier()
from xgboost import plot_importance

# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', None)  # -1 is deprecated in newer pandas


# # Load combined features with label (option 1)

# In[2]:

# %%time
# specified feature sets for joining
ls_feat_file = [
    'baseline.pkl.bz2',
    'baseline_extend.pkl.bz2',
]

# use the first feature set as the base of the join
feat_path = os.path.join("../04_feature_engineering/features", ls_feat_file[0])
pdf_combined = pd.read_pickle(feat_path, compression="bz2")

# join the remaining feature sets
for fname in ls_feat_file[1:]:
    feat_path = os.path.join("../04_feature_engineering/features", fname)
    pdf_feat = pd.read_pickle(feat_path, compression="bz2")
    print(fname, pdf_feat.shape)

    # add a table prefix to column names (except the join key)
    tbl_prefix = fname.split(".")[0]
    rename_col = {cname: "{}_{}".format(tbl_prefix, cname)
                  for cname in pdf_feat.columns if cname != "SK_ID_CURR"}
    pdf_feat.rename(columns=rename_col, inplace=True)

    # join on the application id
    pdf_combined = pdf_combined.merge(pdf_feat, on="SK_ID_CURR", how="left")

print("rows, columns", pdf_combined.shape)
ls_features = [feat for feat in pdf_combined.columns if feat not in ["SK_ID_CURR"]]
display(pdf_combined.head())


# In[ ]:

# join with label
pdf_tvt = pd.read_pickle("../04_feature_engineering/pdf_tvt_extend.pkl", compression="bz2")
pdf_features_label = pdf_tvt.merge(pdf_combined, on="SK_ID_CURR", how="left")
print(pdf_features_label.shape)


# # Load combined features with label (option 2)

# In[2]:

if True:
    pdf_features_label = pd.read_csv(
        os.path.join("../04_feature_engineering/features", "pdf_features_label.csv.bz2"),
        compression="bz2")
    meta_cols = ["SK_ID_CURR", "TARGET", "tvt_code"]
    ls_features = [cname for cname in pdf_features_label.columns if cname not in meta_cols]
    # print("Number of features: {}".format(len(ls_features)))
    print(pdf_features_label.shape)


# # Grid search

# In[3]:

version = "v07"


# In[4]:

# read the previously trained model to reuse its selected features
with open("models/xgb_model_{}.mod".format(version), "rb") as input_file:
    res_model = pickle.load(input_file)

res_model.keys()


# In[5]:

meta_cols = ["SK_ID_CURR", "TARGET", "tvt_code"]
ls_features = [feat for feat, val in res_model["imp"]]
pdf_features_label = pdf_features_label[meta_cols + ls_features]
print("Selected features: {}".format(len(ls_features)))


# In[6]:

from sklearn.model_selection import GridSearchCV, StratifiedKFold

pdf_data = pdf_features_label[pdf_features_label["tvt_code"].isin(["train", "val", "test"])].copy()
pdf_data.shape


# In[ ]:

# %%time
param_grid = {
    "objective": ["binary:logistic"],
    "booster": ["gbtree"],
    "max_depth": [4, 7],              # default: 3, only for depthwise growth
    "n_estimators": [1000],           # default: 500
    "learning_rate": [0.025],         # default: 0.05
    "subsample": [0.6, 0.8],
    "colsample_bytree": [0.6, 0.8],   # default: 1.0
    "colsample_bylevel": [0.6, 0.8],  # default: 1.0
    "random_state": [1],
    'min_child_weight': [11],

    #
    "silent": [True],
    'seed': [1]
}

xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(xgb_model, param_grid, n_jobs=16,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1),
                           scoring='roc_auc',
                           verbose=2)

grid_result = grid_search.fit(pdf_data[ls_features], pdf_data["TARGET"])


# In[15]:

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

#
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

#
# for check_param in ["max_depth", "subsample", "colsample_bytree", "colsample_bylevel"]:
#     plt.errorbar(param_grid[check_param], means, yerr=stds)
#     plt.title("XGBoost {} vs AUC".format(check_param))
#     plt.xlabel(check_param)
#     plt.ylabel('AUC')
#     plt.show()
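# A small aside (not in the original notebook): `GridSearchCV` refits the best parameter
# combination on the full data by default, so the tuned model is available as
# `grid_search.best_estimator_`. The cell below is a minimal sketch that ranks the selected
# features by that refitted XGBClassifier's `feature_importances_`; it assumes `ls_features`
# is in the same order as the columns passed to `fit` above, and the names `best_model` /
# `pdf_imp` are illustrative only.

# In[ ]:

# sketch: rank features by importance from the refitted best estimator
best_model = grid_search.best_estimator_
pdf_imp = pd.DataFrame({
    "feature": ls_features,
    "importance": best_model.feature_importances_,
}).sort_values("importance", ascending=False)
display(pdf_imp.head(20))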
# # Submission

# In[10]:

# score the Kaggle test set with the best model found by the grid search
X_kaggle_test = pdf_features_label.query("tvt_code == 'kaggle_test'")[ls_features]
y_test_pred = grid_search.predict_proba(X_kaggle_test)[:, 1]
y_test_pred.mean()


# In[11]:

SK_IDs = pdf_features_label.query("tvt_code == 'kaggle_test'")["SK_ID_CURR"].tolist()
pdf_submiss = pd.DataFrame({"SK_ID_CURR": SK_IDs, "TARGET": y_test_pred})
pdf_submiss.to_csv("submissions/submission_gridsearch_{}.csv".format(version), index=False)
pdf_submiss.head()


# In[12]:

# save the grid-search objects to file
res_model = {
    "grid_search": grid_search,
    "grid_result": grid_result
}
with open("models/xgb_gridsearch_{}.mod".format(version), "wb") as output_file:
    pickle.dump(res_model, output_file)
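# A small aside (not in the original notebook): a minimal sketch of reloading the pickle
# saved above, to confirm it round-trips and to recover the tuned configuration. The
# variable name `res_loaded` is illustrative only.

# In[ ]:

# sketch: reload the saved grid-search results and inspect the best configuration
with open("models/xgb_gridsearch_{}.mod".format(version), "rb") as f:
    res_loaded = pickle.load(f)

print("Best CV AUC: {:.4f}".format(res_loaded["grid_result"].best_score_))
print(res_loaded["grid_result"].best_params_)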