#!/usr/bin/env python # coding: utf-8 # In[1]: import os, math, subprocess import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from IPython.display import display from lib_feature_engineering import * # some settings for displaying Pandas results pd.set_option('display.width', 2000) pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.precision', 4) pd.set_option('display.max_colwidth', -1) # # Load train and test data # In[2]: # train train_path = "home-credit-default-risk/application_train.csv" pdf_train = pd.read_csv(train_path) print("(rows, columns)", pdf_train.shape) print("First 5 rows") display(pdf_train.head(5)) # test test_path = "home-credit-default-risk/application_test.csv" pdf_test = pd.read_csv(test_path) print("(rows, columns)", pdf_test.shape) print("First 5 rows") display(pdf_test.head(5)) # load meta data meta_path = "../02_pandas/reports/report_application_train.csv" pdf_meta = pd.read_csv(meta_path) display(pdf_meta) # In[3]: # filter by tvt code pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2") pdf_train_filtered = (pdf_tvt_extend.query("tvt_code == 'train'") .merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR") .drop(columns=["tvt_code"])) pdf_train_filtered.head() # # Preprocessing # ## NAME_TYPE_SUITE: Người đi cùng khi đi vay # In[4]: pdf_train['NAME_TYPE_SUITE'].isna().sum() # In[5]: pdf_train['NAME_TYPE_SUITE'].value_counts() # In[6]: # Có 1292 giá trị null, vì đa số là Unaccompanied, nên sẽ fillna là Unaccomanied pdf_train['NAME_TYPE_SUITE'].fillna(value='Unaccompanied', inplace=True) pdf_test['NAME_TYPE_SUITE'].fillna(value='Unaccompanied', inplace=True) # ## OWN_CAR_AGE: Tuổi thọ của chiếc xe # In[7]: pdf_train['OWN_CAR_AGE'].isna().mean() # In[8]: pdf_train[(pdf_train["OWN_CAR_AGE"].isna()) & (pdf_train["FLAG_OWN_CAR"] == "Y")].shape # In[9]: pdf_train[(pdf_train["OWN_CAR_AGE"].isna()) & (pdf_train["FLAG_OWN_CAR"] == "N")].shape # In[10]: # 65% không điền thông tin, những người không có xe thì sẽ không điền, nên fillna là 0 pdf_train['OWN_CAR_AGE'].fillna(value=0, inplace=True) pdf_test['OWN_CAR_AGE'].fillna(value=0, inplace=True) # ## Percent credit and income # In[11]: # AMT_CREDIT: Số tiền vay # AMT_INCOME_TOTAL: Thu nhập của khách hàng # AMT_ANNUITY: Số tiền phải trả hàng năm pdf_train['CREDIT_INCOME_PERCENT'] = pdf_train['AMT_CREDIT'] / pdf_train['AMT_INCOME_TOTAL'] pdf_train['ANNUITY_INCOME_PERCENT'] = pdf_train['AMT_ANNUITY'] / pdf_train['AMT_INCOME_TOTAL'] pdf_train['CREDIT_TERM'] = pdf_train['AMT_ANNUITY'] / pdf_train['AMT_CREDIT'] pdf_test['CREDIT_INCOME_PERCENT'] = pdf_test['AMT_CREDIT'] / pdf_test['AMT_INCOME_TOTAL'] pdf_test['ANNUITY_INCOME_PERCENT'] = pdf_test['AMT_ANNUITY'] / pdf_test['AMT_INCOME_TOTAL'] pdf_test['CREDIT_TERM'] = pdf_test['AMT_ANNUITY'] / pdf_test['AMT_CREDIT'] # # Binary vs one-hot encoding features # In[12]: get_ipython().run_cell_magic('time', '', 'def gen_binary_one_hot_feat(pdf_input):\n\n pdf_data = pdf_input.copy()\n select_features = []\n dict_feat = {\n "binary_default": {\n "NAME_CONTRACT_TYPE": [\'Cash loans\', \'Revolving loans\'],\n "CODE_GENDER": [\'M\', \'F\', \'XNA\'],\n "FLAG_OWN_CAR": [\'Y\', \'N\'],\n "FLAG_OWN_REALTY": [\'Y\', \'N\'],\n "EMERGENCYSTATE_MODE": [\'Yes\', \'No\'],\n },\n "binary": [\n "FLAG_EMP_PHONE",\n "FLAG_WORK_PHONE",\n "FLAG_PHONE",\n "FLAG_EMAIL",\n "REG_REGION_NOT_LIVE_REGION",\n "REG_REGION_NOT_WORK_REGION",\n "LIVE_REGION_NOT_WORK_REGION",\n "REG_CITY_NOT_WORK_CITY",\n "LIVE_CITY_NOT_WORK_CITY",\n "FLAG_DOCUMENT_3",\n "FLAG_DOCUMENT_5",\n "FLAG_DOCUMENT_6",\n "FLAG_DOCUMENT_8",\n "FLAG_DOCUMENT_9",\n "REGION_RATING_CLIENT",\n "REGION_RATING_CLIENT_W_CITY",\n ],\n "onehot": {\n "NAME_TYPE_SUITE": ["Unaccompanied", "Family", "Spouse, partner", "Children", "Other_A", "Other_B", "Group of people"],\n "NAME_INCOME_TYPE": ["Working", "State servant", "Commercial associate", "Pensioner", "Unemployed", "Student", "Businessman", "Maternity leave"],\n "NAME_EDUCATION_TYPE": ["Secondary / secondary special", "Higher education", "Incomplete higher", "Lower secondary", "Academic degree"],\n "NAME_FAMILY_STATUS": ["Single / not married", "Married", "Civil marriage", "Widow", "Separated", "Unknown"],\n "NAME_HOUSING_TYPE": ["House / apartment", "Rented apartment", "With parents", "Municipal apartment", "Office apartment", "Co-op apartment"],\n "OCCUPATION_TYPE": ["Laborers", "Core staff", "Accountants", "Managers", "Drivers", "Sales staff", "Cleaning staff", "Cooking staff", "Private service staff", "Medicine staff", "Security staff", "High skill tech staff", "Waiters/barmen staff", "Low-skill Laborers", "Realty agents", "Secretaries", "IT staff", "HR staff"],\n "ORGANIZATION_TYPE": ["Business Entity Type 3", "School", "Government", "Religion", "Other", "XNA", "Electricity", "Medicine", "Business Entity Type 2", "Self-employed", "Transport: type 2", "Construction", "Housing", "Kindergarten", "Trade: type 7", "Industry: type 11", "Military", "Services", "Security Ministries", "Transport: type 4", "Industry: type 1", "Emergency", "Security", "Trade: type 2", "University", "Transport: type 3", "Police", "Business Entity Type 1", "Postal", "Industry: type 4", "Agriculture", "Restaurant", "Culture", "Hotel", "Industry: type 7", "Trade: type 3", "Industry: type 3", "Bank", "Industry: type 9", "Insurance", "Trade: type 6", "Industry: type 2", "Transport: type 1", "Industry: type 12", "Mobile", "Trade: type 1", "Industry: type 5", "Industry: type 10", "Legal Services", "Advertising", "Trade: type 5", "Cleaning", "Industry: type 13", "Trade: type 4", "Telecom", "Industry: type 8", "Realtor", "Industry: type 6"],\n "FONDKAPREMONT_MODE": ["reg oper account", "org spec account", "reg oper spec account", "not specified"],\n "HOUSETYPE_MODE": ["block of flats", "terraced house", "specific housing"],\n "WALLSMATERIAL_MODE": ["Stone, brick", "Block", "Panel", "Mixed", "Wooden", "Others", "Monolithic"], \n }\n }\n\n for k in dict_feat:\n if k == "binary_default":\n for cname in dict_feat[k]:\n # get default value\n default_val = dict_feat[k][cname][0]\n\n # convert category to binary\n feat_name = "is_" + cname\n select_features.append(feat_name)\n pdf_data[feat_name] = pdf_data[cname].apply(lambda x: int(x == default_val))\n elif k == "binary":\n # rename only\n for cname in dict_feat[k]:\n feat_name = "is_" + cname\n select_features.append(feat_name)\n pdf_data[feat_name] = pdf_data[cname]\n elif k == "onehot":\n for cname in dict_feat[k]:\n ls_vals = dict_feat[k][cname]\n for val in ls_vals:\n try:\n new_name = "{}_{}".format(cname, val.replace(" ", "_")\\\n .replace(":", "_")\\\n .replace("/", "_")\\\n .replace("-", "_"))\n\n select_features.append(new_name)\n pdf_data[new_name] = pdf_data[cname].apply(lambda x: int(x == val))\n except Exception as err:\n print("One hot for {}-{}. Error: {}".format(cname, val, err)) \n \n return pdf_data[["SK_ID_CURR"] + select_features]\n\n# for train feat\npdf01_baseline = gen_binary_one_hot_feat(pdf_train)\n\n# for test feat\npdf02_baseline = gen_binary_one_hot_feat(pdf_test)\n\n# print results\nprint(pdf01_baseline.shape, pdf02_baseline.shape)\ndisplay(pdf01_baseline.head().T)\n') # In[13]: eval_agg01 = feature_evaluate(pdf_train_filtered, pdf01_baseline) display(eval_agg01) # In[14]: eval_agg01.query("auc <= 0.501").shape # In[15]: sel_feat = eval_agg01.query("auc > 0.501")["name"].tolist() # for train pdf01_baseline = pdf01_baseline[["SK_ID_CURR"] + sel_feat] print(pdf01_baseline.shape) # for test pdf02_baseline = pdf02_baseline[["SK_ID_CURR"] + sel_feat] print(pdf02_baseline.shape) # # keep columns # In[16]: # Due to add more columns for "Percent credit and income" so we cannot use info from meta data # ls_continuous_name = pdf_meta[pdf_meta["sub_type"] == "float64"]["name"].tolist() s_dtype = pdf_train.dtypes ls_continuous_name = s_dtype[s_dtype == "float64"].index.tolist() print(len(ls_continuous_name)) # In[17]: # for train feat pdf11_baseline = pdf_train[["SK_ID_CURR"] + ls_continuous_name].copy() # for test feat pdf12_baseline = pdf_test[["SK_ID_CURR"] + ls_continuous_name].copy() # print results print(pdf11_baseline.shape, pdf12_baseline.shape) display(pdf11_baseline.head().T) # In[18]: eval_agg02 = feature_evaluate(pdf_train_filtered, pdf11_baseline) display(eval_agg02) # In[19]: eval_agg02.query("auc <= 0.501").shape # ## convert days to years # In[20]: # The numbers in the DAYS_BIRTH column are negative because they are recorded relative to the current loan application. # To see these stats in years, we can mutliple by -1 and divide by the number of days in a year: pdf11_baseline["YEARS_BIRTH"] = pdf_train["DAYS_BIRTH"] / -365 pdf12_baseline["YEARS_BIRTH"] = pdf_test["DAYS_BIRTH"] / -365 # similarly pdf11_baseline["REGISTRATION_YEAR"] = pdf_train["DAYS_REGISTRATION"] / -365 pdf12_baseline["REGISTRATION_YEAR"] = pdf_test["DAYS_REGISTRATION"] / -365 # similarly pdf11_baseline["ID_PUBLISH_YEAR"] = pdf_train["DAYS_ID_PUBLISH"] / -365 pdf12_baseline["ID_PUBLISH_YEAR"] = pdf_test["DAYS_ID_PUBLISH"] / -365 # similarly pdf11_baseline["LAST_PHONE_CHANGE_YEAR"] = pdf_train["DAYS_LAST_PHONE_CHANGE"] / -365 pdf12_baseline["LAST_PHONE_CHANGE_YEAR"] = pdf_test["DAYS_LAST_PHONE_CHANGE"] / -365 # # handling DAYS_EMPLOYED # In[21]: pdf_train["DAYS_EMPLOYED"].describe() # In[22]: pdf_train["DAYS_EMPLOYED"].hist() plt.show() # In[23]: # check anomaly anom = pdf_train[pdf_train["DAYS_EMPLOYED"] == 365243] non_anom = pdf_train[pdf_train["DAYS_EMPLOYED"] != 365243] print("Tỉ lệ phần trăm TARGET của non-anomalies: {}".format(100 * non_anom["TARGET"].mean())) print("Tỉ lệ phần trăm TARGET của anomalies: {}".format(100 * anom["TARGET"].mean())) print("Số lượng anomalies là {}".format(len(anom))) # In[24]: def handling_days_employed(pdf_input, pdf_output): # Create an anomalous flag column pdf_output["DAYS_EMPLOYED_ANOM"] = pdf_input["DAYS_EMPLOYED"] == 365243 # Replace the anomalous values with nan pdf_output["DAYS_EMPLOYED"] = pdf_input["DAYS_EMPLOYED"].replace({365243: np.nan}) # Calculate years employed pdf_output["YEARS_EMPLOYED"] = pdf_output["DAYS_EMPLOYED"] / -365 # percent employed over years of birth pdf_output['YEARS_EMPLOYED_PERCENT'] = pdf_output['YEARS_EMPLOYED'] / pdf_output['YEARS_BIRTH'] return pdf_output pdf11_baseline = handling_days_employed(pdf_train, pdf11_baseline) pdf12_baseline = handling_days_employed(pdf_test, pdf12_baseline) # # save features # In[25]: get_ipython().run_cell_magic('time', '', 'def store_features(pdf_train, pdf_test, fname):\n print(pdf_train.shape, pdf_test.shape)\n fname = os.path.join("features", "{}.pkl.bz2".format(fname))\n pdf_out = pd.concat([pdf_train, pdf_test]).reset_index(drop=True)\n pdf_out.to_pickle(fname, compression="bz2")\n print("Store features completed!")\n\nstore_features(pdf01_baseline, pdf02_baseline, "baseline")\nstore_features(pdf11_baseline, pdf12_baseline, "baseline_extend")\n')