#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os, math, subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from IPython.display import display
from lib_feature_engineering import *

# some settings for displaying Pandas results
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', -1)


# # Load train and test data

# In[2]:


# train
train_path = "home-credit-default-risk/application_train.csv"
pdf_train = pd.read_csv(train_path)
print("(rows, columns)", pdf_train.shape)
print("First 5 rows")
display(pdf_train.head(5))

# test
test_path = "home-credit-default-risk/application_test.csv"
pdf_test = pd.read_csv(test_path)
print("(rows, columns)", pdf_test.shape)
print("First 5 rows")
display(pdf_test.head(5))

# load meta data
meta_path = "../02_pandas/reports/report_application_train.csv"
pdf_meta = pd.read_csv(meta_path)
display(pdf_meta)


# In[3]:


# filter by tvt code
pdf_tvt_extend = pd.read_pickle("pdf_tvt_extend.pkl", compression="bz2")
pdf_train_filtered = (pdf_tvt_extend.query("tvt_code == 'train'")
                      .merge(pdf_train[["SK_ID_CURR"]], on="SK_ID_CURR")
                      .drop(columns=["tvt_code"]))
pdf_train_filtered.head()


# # Preprocessing

# ## NAME_TYPE_SUITE: Người đi cùng khi đi vay

# In[4]:


pdf_train['NAME_TYPE_SUITE'].isna().sum()


# In[5]:


pdf_train['NAME_TYPE_SUITE'].value_counts()


# In[6]:


# Có 1292 giá trị null, vì đa số là Unaccompanied, nên sẽ fillna là Unaccomanied
pdf_train['NAME_TYPE_SUITE'].fillna(value='Unaccompanied', inplace=True)
pdf_test['NAME_TYPE_SUITE'].fillna(value='Unaccompanied', inplace=True)


# ## OWN_CAR_AGE: Tuổi thọ của chiếc xe

# In[7]:


pdf_train['OWN_CAR_AGE'].isna().mean()


# In[8]:


pdf_train[(pdf_train["OWN_CAR_AGE"].isna()) & (pdf_train["FLAG_OWN_CAR"] == "Y")].shape


# In[9]:


pdf_train[(pdf_train["OWN_CAR_AGE"].isna()) & (pdf_train["FLAG_OWN_CAR"] == "N")].shape


# In[10]:


# 65% không điền thông tin, những người không có xe thì sẽ không điền, nên fillna là 0
pdf_train['OWN_CAR_AGE'].fillna(value=0, inplace=True)
pdf_test['OWN_CAR_AGE'].fillna(value=0, inplace=True)


# ## Percent credit and income

# In[11]:


# AMT_CREDIT: Số tiền vay
# AMT_INCOME_TOTAL: Thu nhập của khách hàng
# AMT_ANNUITY: Số tiền phải trả hàng năm
pdf_train['CREDIT_INCOME_PERCENT'] = pdf_train['AMT_CREDIT'] / pdf_train['AMT_INCOME_TOTAL']
pdf_train['ANNUITY_INCOME_PERCENT'] = pdf_train['AMT_ANNUITY'] / pdf_train['AMT_INCOME_TOTAL']
pdf_train['CREDIT_TERM'] = pdf_train['AMT_ANNUITY'] / pdf_train['AMT_CREDIT']

pdf_test['CREDIT_INCOME_PERCENT'] = pdf_test['AMT_CREDIT'] / pdf_test['AMT_INCOME_TOTAL']
pdf_test['ANNUITY_INCOME_PERCENT'] = pdf_test['AMT_ANNUITY'] / pdf_test['AMT_INCOME_TOTAL']
pdf_test['CREDIT_TERM'] = pdf_test['AMT_ANNUITY'] / pdf_test['AMT_CREDIT']


# # Binary vs one-hot encoding features

# In[12]:


get_ipython().run_cell_magic('time', '', 'def gen_binary_one_hot_feat(pdf_input):\n\n    pdf_data = pdf_input.copy()\n    select_features = []\n    dict_feat = {\n        "binary_default": {\n            "NAME_CONTRACT_TYPE": [\'Cash loans\', \'Revolving loans\'],\n            "CODE_GENDER": [\'M\', \'F\', \'XNA\'],\n            "FLAG_OWN_CAR": [\'Y\', \'N\'],\n            "FLAG_OWN_REALTY": [\'Y\', \'N\'],\n            "EMERGENCYSTATE_MODE": [\'Yes\', \'No\'],\n        },\n        "binary": [\n            "FLAG_EMP_PHONE",\n            "FLAG_WORK_PHONE",\n            "FLAG_PHONE",\n            "FLAG_EMAIL",\n            "REG_REGION_NOT_LIVE_REGION",\n            "REG_REGION_NOT_WORK_REGION",\n            "LIVE_REGION_NOT_WORK_REGION",\n            "REG_CITY_NOT_WORK_CITY",\n            "LIVE_CITY_NOT_WORK_CITY",\n            "FLAG_DOCUMENT_3",\n            "FLAG_DOCUMENT_5",\n            "FLAG_DOCUMENT_6",\n            "FLAG_DOCUMENT_8",\n            "FLAG_DOCUMENT_9",\n            "REGION_RATING_CLIENT",\n            "REGION_RATING_CLIENT_W_CITY",\n        ],\n        "onehot": {\n            "NAME_TYPE_SUITE": ["Unaccompanied", "Family", "Spouse, partner", "Children", "Other_A", "Other_B", "Group of people"],\n            "NAME_INCOME_TYPE": ["Working", "State servant", "Commercial associate", "Pensioner", "Unemployed", "Student", "Businessman", "Maternity leave"],\n            "NAME_EDUCATION_TYPE": ["Secondary / secondary special", "Higher education", "Incomplete higher", "Lower secondary", "Academic degree"],\n            "NAME_FAMILY_STATUS": ["Single / not married", "Married", "Civil marriage", "Widow", "Separated", "Unknown"],\n            "NAME_HOUSING_TYPE": ["House / apartment", "Rented apartment", "With parents", "Municipal apartment", "Office apartment", "Co-op apartment"],\n            "OCCUPATION_TYPE": ["Laborers", "Core staff", "Accountants", "Managers", "Drivers", "Sales staff", "Cleaning staff", "Cooking staff", "Private service staff", "Medicine staff", "Security staff", "High skill tech staff", "Waiters/barmen staff", "Low-skill Laborers", "Realty agents", "Secretaries", "IT staff", "HR staff"],\n            "ORGANIZATION_TYPE": ["Business Entity Type 3", "School", "Government", "Religion", "Other", "XNA", "Electricity", "Medicine", "Business Entity Type 2", "Self-employed", "Transport: type 2", "Construction", "Housing", "Kindergarten", "Trade: type 7", "Industry: type 11", "Military", "Services", "Security Ministries", "Transport: type 4", "Industry: type 1", "Emergency", "Security", "Trade: type 2", "University", "Transport: type 3", "Police", "Business Entity Type 1", "Postal", "Industry: type 4", "Agriculture", "Restaurant", "Culture", "Hotel", "Industry: type 7", "Trade: type 3", "Industry: type 3", "Bank", "Industry: type 9", "Insurance", "Trade: type 6", "Industry: type 2", "Transport: type 1", "Industry: type 12", "Mobile", "Trade: type 1", "Industry: type 5", "Industry: type 10", "Legal Services", "Advertising", "Trade: type 5", "Cleaning", "Industry: type 13", "Trade: type 4", "Telecom", "Industry: type 8", "Realtor", "Industry: type 6"],\n            "FONDKAPREMONT_MODE": ["reg oper account", "org spec account", "reg oper spec account", "not specified"],\n            "HOUSETYPE_MODE": ["block of flats", "terraced house", "specific housing"],\n            "WALLSMATERIAL_MODE": ["Stone, brick", "Block", "Panel", "Mixed", "Wooden", "Others", "Monolithic"],            \n        }\n    }\n\n    for k in dict_feat:\n        if k == "binary_default":\n            for cname in dict_feat[k]:\n                # get default value\n                default_val = dict_feat[k][cname][0]\n\n                # convert category to binary\n                feat_name = "is_" + cname\n                select_features.append(feat_name)\n                pdf_data[feat_name] = pdf_data[cname].apply(lambda x: int(x == default_val))\n        elif k == "binary":\n            # rename only\n            for cname in dict_feat[k]:\n                feat_name = "is_" + cname\n                select_features.append(feat_name)\n                pdf_data[feat_name] = pdf_data[cname]\n        elif k == "onehot":\n            for cname in dict_feat[k]:\n                ls_vals = dict_feat[k][cname]\n                for val in ls_vals:\n                    try:\n                        new_name = "{}_{}".format(cname, val.replace(" ", "_")\\\n                                                              .replace(":", "_")\\\n                                                              .replace("/", "_")\\\n                                                              .replace("-", "_"))\n\n                        select_features.append(new_name)\n                        pdf_data[new_name] = pdf_data[cname].apply(lambda x: int(x == val))\n                    except Exception as err:\n                        print("One hot for {}-{}. Error: {}".format(cname, val, err))                        \n                \n    return pdf_data[["SK_ID_CURR"] + select_features]\n\n# for train feat\npdf01_baseline = gen_binary_one_hot_feat(pdf_train)\n\n# for test feat\npdf02_baseline = gen_binary_one_hot_feat(pdf_test)\n\n# print results\nprint(pdf01_baseline.shape, pdf02_baseline.shape)\ndisplay(pdf01_baseline.head().T)\n')


# In[13]:


eval_agg01 = feature_evaluate(pdf_train_filtered, pdf01_baseline)
display(eval_agg01)


# In[14]:


eval_agg01.query("auc <= 0.501").shape


# In[15]:


sel_feat = eval_agg01.query("auc > 0.501")["name"].tolist()

# for train
pdf01_baseline = pdf01_baseline[["SK_ID_CURR"] + sel_feat]
print(pdf01_baseline.shape)

# for test
pdf02_baseline = pdf02_baseline[["SK_ID_CURR"] + sel_feat]
print(pdf02_baseline.shape)


# # keep columns

# In[16]:


# Due to add more columns for "Percent credit and income" so we cannot use info from meta data
# ls_continuous_name = pdf_meta[pdf_meta["sub_type"] == "float64"]["name"].tolist()
s_dtype = pdf_train.dtypes
ls_continuous_name = s_dtype[s_dtype == "float64"].index.tolist()
print(len(ls_continuous_name))


# In[17]:


# for train feat
pdf11_baseline = pdf_train[["SK_ID_CURR"] + ls_continuous_name].copy()

# for test feat
pdf12_baseline = pdf_test[["SK_ID_CURR"] + ls_continuous_name].copy()

# print results
print(pdf11_baseline.shape, pdf12_baseline.shape)
display(pdf11_baseline.head().T)


# In[18]:


eval_agg02 = feature_evaluate(pdf_train_filtered, pdf11_baseline)
display(eval_agg02)


# In[19]:


eval_agg02.query("auc <= 0.501").shape


# ## convert days to years

# In[20]:


# The numbers in the DAYS_BIRTH column are negative because they are recorded relative to the current loan application. 
# To see these stats in years, we can mutliple by -1 and divide by the number of days in a year:
pdf11_baseline["YEARS_BIRTH"] = pdf_train["DAYS_BIRTH"] / -365
pdf12_baseline["YEARS_BIRTH"] = pdf_test["DAYS_BIRTH"] / -365

# similarly
pdf11_baseline["REGISTRATION_YEAR"] = pdf_train["DAYS_REGISTRATION"] / -365
pdf12_baseline["REGISTRATION_YEAR"] = pdf_test["DAYS_REGISTRATION"] / -365

# similarly
pdf11_baseline["ID_PUBLISH_YEAR"] = pdf_train["DAYS_ID_PUBLISH"] / -365
pdf12_baseline["ID_PUBLISH_YEAR"] = pdf_test["DAYS_ID_PUBLISH"] / -365

# similarly
pdf11_baseline["LAST_PHONE_CHANGE_YEAR"] = pdf_train["DAYS_LAST_PHONE_CHANGE"] / -365
pdf12_baseline["LAST_PHONE_CHANGE_YEAR"] = pdf_test["DAYS_LAST_PHONE_CHANGE"] / -365


# # handling DAYS_EMPLOYED

# In[21]:


pdf_train["DAYS_EMPLOYED"].describe()


# In[22]:


pdf_train["DAYS_EMPLOYED"].hist()
plt.show()


# In[23]:


# check anomaly
anom = pdf_train[pdf_train["DAYS_EMPLOYED"] == 365243]
non_anom = pdf_train[pdf_train["DAYS_EMPLOYED"] != 365243]
print("Tỉ lệ phần trăm TARGET của non-anomalies: {}".format(100 * non_anom["TARGET"].mean()))
print("Tỉ lệ phần trăm TARGET của anomalies: {}".format(100 * anom["TARGET"].mean()))
print("Số lượng anomalies là {}".format(len(anom)))


# In[24]:


def handling_days_employed(pdf_input, pdf_output):
    # Create an anomalous flag column
    pdf_output["DAYS_EMPLOYED_ANOM"] = pdf_input["DAYS_EMPLOYED"] == 365243

    # Replace the anomalous values with nan
    pdf_output["DAYS_EMPLOYED"] = pdf_input["DAYS_EMPLOYED"].replace({365243: np.nan})

    # Calculate years employed
    pdf_output["YEARS_EMPLOYED"] = pdf_output["DAYS_EMPLOYED"] / -365
    
    # percent employed over years of birth
    pdf_output['YEARS_EMPLOYED_PERCENT'] = pdf_output['YEARS_EMPLOYED'] / pdf_output['YEARS_BIRTH']

    return pdf_output


pdf11_baseline = handling_days_employed(pdf_train, pdf11_baseline)
pdf12_baseline = handling_days_employed(pdf_test, pdf12_baseline)


# # save features

# In[25]:


get_ipython().run_cell_magic('time', '', 'def store_features(pdf_train, pdf_test, fname):\n    print(pdf_train.shape, pdf_test.shape)\n    fname = os.path.join("features", "{}.pkl.bz2".format(fname))\n    pdf_out = pd.concat([pdf_train, pdf_test]).reset_index(drop=True)\n    pdf_out.to_pickle(fname, compression="bz2")\n    print("Store features completed!")\n\nstore_features(pdf01_baseline, pdf02_baseline, "baseline")\nstore_features(pdf11_baseline, pdf12_baseline, "baseline_extend")\n')