#!/usr/bin/env python
# coding: utf-8

# Notebook export: XGBoost feature selection, model persistence, evaluation
# plots and Kaggle submission file for the combined feature table.
#
# The script relies on IPython (``get_ipython``) and on helpers star-imported
# from the project-local ``lib_modeling`` module (``feature_selection_steps``,
# ``visualize_auc``), so it must be run inside an IPython/Jupyter session.

# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:

import os
import pickle
import re
import collections

import pandas as pd
import numpy as np

# matplotlib is required below for the feature-importance figure (``plt``).
import matplotlib.pyplot as plt
from IPython.display import display
from lib_modeling import *
from xgboost import plot_importance

# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
# "No truncation" is spelled ``None`` on pandas >= 1.0 (``-1`` was deprecated
# and later rejected); very old pandas only accepts the integer form.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)


# In[3]:

# Tag used in the model and submission file names.
version = "v07"


# # Load combined features with label

# In[4]:

# pdf_features_label = pd.read_pickle(os.path.join("features", "pdf_features_label.pkl.bz2"), compression="bz2")
pdf_features_label = pd.read_csv(
    os.path.join("../04_feature_engineering/features", "pdf_features_label.csv.bz2"),
    compression="bz2",
)

# Every column that is not an id / label / split marker is a model feature.
meta_cols = ["SK_ID_CURR", "TARGET", "tvt_code"]
ls_features = [cname for cname in pdf_features_label.columns if cname not in meta_cols]

# NOTE(review): the flattened export does not show whether this print was
# active or commented out; it is side-effect free, so it is kept active.
print("Number of features: {}".format(len(ls_features)))
print(pdf_features_label.shape)
display(pdf_features_label.head().T)


# In[5]:

# Row count per split code (the script uses "train", "val", "test" and
# "kaggle_test").
pdf_features_label["tvt_code"].value_counts()


# # Modeling (advanced)

# In[ ]:

# The cell runs under ``%%time``; it defines ``idx_train``, ``idx_test_list``,
# ``param_init``, ``param_fit``, ``options`` and ``ls_res_selection_info`` in
# the interactive namespace, which the following cells read.
get_ipython().run_cell_magic(
    'time',
    '',
    """# get train/val/test index list
idx_train = pdf_features_label["tvt_code"] == "train"
idx_test_list = [pdf_features_label["tvt_code"] == "val", pdf_features_label["tvt_code"] == "test"]

#
param_init = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "max_depth": 4,  # default: 3 only for depthwise
    "n_estimators": 1500,  # default: 500
    "learning_rate": 0.025,  # default: 0.05
    "subsample": 0.7,
    "colsample_bytree": 0.6,  # default: 1.0
    "colsample_bylevel": 0.5,  # default: 1.0

    #
    "silent": True,
    "n_jobs": 16,

    #
    "tree_method": "hist",  # default: auto
    "grow_policy": "lossguide",  # default depthwise
}

param_fit = {
    "eval_metric": "auc",
    "early_stopping_rounds": 500,  # default: 100
    "verbose": 200,
}

options = {
    # turn to filter features
    "nturn": 4,

    # turn to run random state
    "auc_check_per_turn_n": 3,

    # drop per turn
    "ndrop_per_turn": 10
}

ls_res_selection_info = feature_selection_steps(
    pdf_input=pdf_features_label,
    ls_features=ls_features,
    target_name="TARGET",
    target_posval=0,
    idx_train=idx_train,
    idx_test_list=idx_test_list,
    xgb_param_init=param_init,
    xgb_param_fit=param_fit,
    options=options,
)
""",
)


# In[ ]:

# Pick the selection turn whose last recorded AUC value is the highest.
i_max_turn = max(
    range(options["nturn"]),
    key=lambda i_res: ls_res_selection_info[i_res]["auc"][-1],
)
xgb_model_i = ls_res_selection_info[i_max_turn]["model"]
ls_auc_i = ls_res_selection_info[i_max_turn]["auc"]
ls_imp_i = ls_res_selection_info[i_max_turn]["imp"]
print("AUC: {}".format(ls_auc_i))


# In[ ]:

# save model to file
res_model = ls_res_selection_info[i_max_turn]
# Store the booster's feature names next to the model so the exact column
# list can be recovered after unpickling.
res_model["features"] = res_model["model"].get_booster().feature_names
os.makedirs("models", exist_ok=True)
# Use a context manager so the file is flushed and closed deterministically.
with open("models/xgb_model_{}.mod".format(version), "wb") as output_file:
    pickle.dump(res_model, output_file)


# # Model evaluation

# In[ ]:

# read model
# NOTE: pickle executes arbitrary code on load; only load model files that
# were produced by this notebook.
with open("models/xgb_model_{}.mod".format(version), "rb") as input_file:
    res_model = pickle.load(input_file)
res_model.keys()


# In[15]:

# NOTE(review): the flattened export does not show whether this call was
# active or commented out; it is kept active, following the author's habit of
# starting cells with a bare "#" separator line.
visualize_auc(pdf_features_label, "test", res_model)


# In[16]:

# Scale the figure height with the number of entries in res_model["imp"].
fig_height = len(res_model["imp"]) / 4
fig, ax = plt.subplots(figsize=(10, fig_height))
plot_importance(res_model["model"], ax=ax)
plt.show()


# In[17]:

pdf_imp = pd.DataFrame(res_model["imp"])
pdf_imp.rename(columns={0: "feat_name", 1: "F-score"}, inplace=True)
pdf_imp.head(50)


# # Save submission

# In[ ]:

# Column of ``predict_proba`` written to the submission.
# NOTE(review): column 0 is normally the probability of the first class in
# ``classes_``; confirm this matches the TARGET convention the submission
# expects (``target_posval=0`` is also passed to feature_selection_steps).
target_pred_posval = 0
ls_features = res_model["model"].get_booster().feature_names

# Select the Kaggle test rows once so features and ids stay aligned.
pdf_kaggle_test = pdf_features_label.query("tvt_code == 'kaggle_test'")
X_kaggle_test = pdf_kaggle_test[ls_features]
y_test_pred = res_model["model"].predict_proba(X_kaggle_test)[:, target_pred_posval]
y_test_pred.mean()


# In[ ]:

SK_IDs = pdf_kaggle_test["SK_ID_CURR"].tolist()
pdf_submiss = pd.DataFrame({"SK_ID_CURR": SK_IDs, "TARGET": y_test_pred})
os.makedirs("submissions", exist_ok=True)
pdf_submiss.to_csv("submissions/submission_{}.csv".format(version), index=False)
pdf_submiss.head()


# ![submission_baseline_v02](submissions/submission_baseline_v02.png "submission_baseline_v02")
# ![submission_baseline_v01](submissions/submission_baseline_v01.png "submission_baseline_v01")