#!/usr/bin/env python
# coding: utf-8

# In[1]:

import os, pickle
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
from IPython.display import display

# from sklearn import metrics
from sklearn.model_selection import train_test_split

import xgboost as xgb  # needed below for xgb.XGBClassifier()
from xgboost import plot_importance

# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', None)  # -1 is deprecated in newer pandas


# # Load combined features with label (option 1)

# In[2]:

# %%time
# specified feature sets for joining
ls_feat_file = [
    'baseline.pkl.bz2',
    'baseline_extend.pkl.bz2',
]

# use the first feature set as the base of the join
feat_path = os.path.join("../04_feature_engineering/features", ls_feat_file[0])
pdf_combined = pd.read_pickle(feat_path, compression="bz2")

# join the remaining feature sets
for fname in ls_feat_file[1:]:
    feat_path = os.path.join("../04_feature_engineering/features", fname)
    pdf_feat = pd.read_pickle(feat_path, compression="bz2")
    print(fname, pdf_feat.shape)

    # add a table prefix to column names (except the join key)
    tbl_prefix = fname.split(".")[0]
    rename_col = {cname: "{}_{}".format(tbl_prefix, cname)
                  for cname in pdf_feat.columns if cname != "SK_ID_CURR"}
    pdf_feat.rename(columns=rename_col, inplace=True)

    # join on the application id
    pdf_combined = pdf_combined.merge(pdf_feat, on="SK_ID_CURR", how="left")

print("rows, columns", pdf_combined.shape)
ls_features = [feat for feat in pdf_combined.columns if feat not in ["SK_ID_CURR"]]
display(pdf_combined.head())


# In[ ]:

# join with label
pdf_tvt = pd.read_pickle("../04_feature_engineering/pdf_tvt_extend.pkl", compression="bz2")
pdf_features_label = pdf_tvt.merge(pdf_combined, on="SK_ID_CURR", how="left")
print(pdf_features_label.shape)


# # Load combined features with label (option 2)

# In[2]:

if True:
    pdf_features_label = pd.read_csv(
        os.path.join("../04_feature_engineering/features", "pdf_features_label.csv.bz2"),
        compression="bz2")
    meta_cols = ["SK_ID_CURR", "TARGET", "tvt_code"]
    ls_features = [cname for cname in pdf_features_label.columns if cname not in meta_cols]
    # print("Number of features: {}".format(len(ls_features)))
    print(pdf_features_label.shape)


# # Grid search

# In[3]:

version = "v07"


# In[4]:

# read the previously trained model to reuse its selected features
with open("models/xgb_model_{}.mod".format(version), "rb") as input_file:
    res_model = pickle.load(input_file)

res_model.keys()


# In[5]:

meta_cols = ["SK_ID_CURR", "TARGET", "tvt_code"]
ls_features = [feat for feat, val in res_model["imp"]]
pdf_features_label = pdf_features_label[meta_cols + ls_features]
print("Selected features: {}".format(len(ls_features)))


# In[6]:

from sklearn.model_selection import GridSearchCV, StratifiedKFold

pdf_data = pdf_features_label[pdf_features_label["tvt_code"].isin(["train", "val", "test"])].copy()
pdf_data.shape


# In[ ]:

# %%time
param_grid = {
    "objective": ["binary:logistic"],
    "booster": ["gbtree"],
    "max_depth": [4, 7],              # default: 3, only for depthwise growth
    "n_estimators": [1000],           # default: 500
    "learning_rate": [0.025],         # default: 0.05
    "subsample": [0.6, 0.8],
    "colsample_bytree": [0.6, 0.8],   # default: 1.0
    "colsample_bylevel": [0.6, 0.8],  # default: 1.0
    "random_state": [1],
    'min_child_weight': [11],

    #
    "silent": [True],
    'seed': [1]
}

xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(xgb_model, param_grid, n_jobs=16,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1),
                           scoring='roc_auc',
                           verbose=2)

grid_result = grid_search.fit(pdf_data[ls_features], pdf_data["TARGET"])


# In[15]:

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

#
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

#
# for check_param in ["max_depth", "subsample", "colsample_bytree", "colsample_bylevel"]:
#     plt.errorbar(param_grid[check_param], means, yerr=stds)
#     plt.title("XGBoost {} vs AUC".format(check_param))
#     plt.xlabel(check_param)
#     plt.ylabel('AUC')
#     plt.show()
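# A small aside (not in the original notebook): `GridSearchCV` refits the best parameter
# combination on the full data by default, so the tuned model is available as
# `grid_search.best_estimator_`. The cell below is a minimal sketch that ranks the selected
# features by that refitted XGBClassifier's `feature_importances_`; it assumes `ls_features`
# is in the same order as the columns passed to `fit` above, and the names `best_model` /
# `pdf_imp` are illustrative only.

# In[ ]:

# sketch: rank features by importance from the refitted best estimator
best_model = grid_search.best_estimator_
pdf_imp = pd.DataFrame({
    "feature": ls_features,
    "importance": best_model.feature_importances_,
}).sort_values("importance", ascending=False)
display(pdf_imp.head(20))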
# # Submission

# In[10]:

# score the Kaggle test set with the best model found by the grid search
X_kaggle_test = pdf_features_label.query("tvt_code == 'kaggle_test'")[ls_features]
y_test_pred = grid_search.predict_proba(X_kaggle_test)[:, 1]
y_test_pred.mean()


# In[11]:

SK_IDs = pdf_features_label.query("tvt_code == 'kaggle_test'")["SK_ID_CURR"].tolist()
pdf_submiss = pd.DataFrame({"SK_ID_CURR": SK_IDs, "TARGET": y_test_pred})
pdf_submiss.to_csv("submissions/submission_gridsearch_{}.csv".format(version), index=False)
pdf_submiss.head()


# In[12]:

# save the grid-search objects to file
res_model = {
    "grid_search": grid_search,
    "grid_result": grid_result
}
with open("models/xgb_gridsearch_{}.mod".format(version), "wb") as output_file:
    pickle.dump(res_model, output_file)
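# A small aside (not in the original notebook): a minimal sketch of reloading the pickle
# saved above, to confirm it round-trips and to recover the tuned configuration. The
# variable name `res_loaded` is illustrative only.

# In[ ]:

# sketch: reload the saved grid-search results and inspect the best configuration
with open("models/xgb_gridsearch_{}.mod".format(version), "rb") as f:
    res_loaded = pickle.load(f)

print("Best CV AUC: {:.4f}".format(res_loaded["grid_result"].best_score_))
print(res_loaded["grid_result"].best_params_)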