#!/usr/bin/env python
# coding: utf-8

# In[1]:

from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from statsmodels.stats.outliers_influence import variance_inflation_factor
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import xgboost as xgb


# In[2]:

df = pd.read_csv('/kaggle/input/credit-card/application_data.csv')


# In[3]:

# Drop irrelevant columns
print(df.columns.tolist())
# I found that many of these columns do not pertain to fraud detection


# In[4]:

columns_to_remove = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG',
                     'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG',
                     'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG',
                     'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE',
                     'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE',
                     'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE',
                     'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
                     'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI',
                     'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI',
                     'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI',
                     'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
                     'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE',
                     'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
                     'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6',
                     'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
                     'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
                     'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12',
                     'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'HOUR_APPR_PROCESS_START']

df = df.drop(columns_to_remove, axis=1)


# In[5]:

# Percentage of Missing Values

# Set the pandas option to display all rows
pd.set_option('display.max_rows', None)

# Calculate the percentage of missing values in each column
missing_percentage = (df.isnull().sum() / len(df)) * 100

# Sort the missing_percentage Series in descending order
missing_percentage_sorted = missing_percentage.sort_values(ascending=False)

# Print the sorted missing_percentage Series
print(missing_percentage_sorted)

# Reset the pandas option to the default value
pd.reset_option('display.max_rows')


# For columns with a very low percentage of missing values (e.g., AMT_GOODS_PRICE, AMT_ANNUITY,
# CNT_FAM_MEMBERS, DAYS_LAST_PHONE_CHANGE), you can impute the missing values using the mean, median, or mode.
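
# A minimal optional check (not part of the original pipeline): the note above mentions mean, median,
# or mode imputation. Before committing to the mode used in the next cell, compare the candidate fill
# values for two of the numeric columns.

for col in ['AMT_GOODS_PRICE', 'AMT_ANNUITY']:
    print(col, '| mean:', df[col].mean(), '| median:', df[col].median(), '| mode:', df[col].mode().iloc[0])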

# In[6]:

columns_to_impute = ['AMT_GOODS_PRICE', 'AMT_ANNUITY', 'CNT_FAM_MEMBERS', 'DAYS_LAST_PHONE_CHANGE']

for col in columns_to_impute:
    mode = df[col].mode().iloc[0]
    df[col] = df[col].fillna(mode)


# For categorical variables with missing values (e.g., OCCUPATION_TYPE, NAME_TYPE_SUITE),
# I will again use mode imputation.

# In[8]:

categorical_columns = ['OCCUPATION_TYPE', 'NAME_TYPE_SUITE']

for col in categorical_columns:
    mode = df[col].mode().iloc[0]
    df[col] = df[col].fillna(mode)


# For columns related to the Credit Bureau (e.g., AMT_REQ_CREDIT_BUREAU_HOUR, AMT_REQ_CREDIT_BUREAU_DAY,
# AMT_REQ_CREDIT_BUREAU_WEEK, AMT_REQ_CREDIT_BUREAU_MON, AMT_REQ_CREDIT_BUREAU_QRT,
# AMT_REQ_CREDIT_BUREAU_YEAR), I will fill missing values with 0, assuming no inquiries were made.

# In[9]:

credit_bureau_columns = ['AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
                         'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']

for col in credit_bureau_columns:
    df[col] = df[col].fillna(0)


# In[10]:

# 'XNA' entries are also effectively null values
# Count the number of 'XNA' entries in each column
xna_counts_per_column = (df == 'XNA').sum()
print(xna_counts_per_column.sort_values(ascending=False))


# In[11]:

# Too many XNAs in the ORGANIZATION_TYPE column, so we drop it
df = df.drop('ORGANIZATION_TYPE', axis=1)


# In[12]:

# We have 4 XNA values in the gender column; find the dominant gender and replace XNA with that gender
gender_counts = df['CODE_GENDER'].value_counts()

# Calculate the total number of male and female entries
total_gender_count = gender_counts.sum()

# Calculate the percentage of males and females
male_percentage = (gender_counts['M'] / total_gender_count) * 100
female_percentage = (gender_counts['F'] / total_gender_count) * 100

print(male_percentage)
print(female_percentage)


# In[13]:

df['CODE_GENDER'] = df['CODE_GENDER'].replace('XNA', 'F')


# I want to use PCA for dimensionality reduction; however, it only works with numerical data types and
# requires no missing values. More preprocessing is needed before applying PCA.

# In[14]:

# Check for remaining missing values in the numeric columns
numeric_columns = df.select_dtypes(include=[np.number])

# Calculate the number of missing values for each numeric column
missing_values_count = numeric_columns.isnull().sum()

# Sort the missing values count from high to low
sorted_missing_values_count = missing_values_count.sort_values(ascending=False)

# Print the sorted number of missing values for each numeric column
print("Number of missing values for each numeric column (sorted from high to low):")
print(sorted_missing_values_count)


# In[15]:

# OWN_CAR_AGE still has a large share of missing values, so drop it
df = df.drop('OWN_CAR_AGE', axis=1)


# In[16]:

correlation_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(32, 24))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Matrix")
plt.show()


# In[17]:

# Set a threshold for high correlation
threshold = 0.7

# Create a mask to filter out self-correlations (correlation of a variable with itself)
mask = correlation_matrix != 1

# Apply the mask to the correlation matrix
filtered_corr_matrix = correlation_matrix[mask]

# Find the pairs of variables with correlation greater than the threshold
high_correlations = filtered_corr_matrix[filtered_corr_matrix.abs() > threshold].stack().reset_index()

# Rename the columns in the high_correlations DataFrame
high_correlations.columns = ['Variable 1', 'Variable 2', 'Correlation']

# Sort the DataFrame by the absolute value of the correlation in descending order
high_correlations['Abs Correlation'] = high_correlations['Correlation'].abs()
high_correlations = high_correlations.sort_values(by='Abs Correlation', ascending=False).drop('Abs Correlation', axis=1)

print(high_correlations)
print(df.columns.tolist())


# In[18]:

# Check for multicollinearity by calculating the VIF
selected_columns = df[['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
                       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
                       'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
                       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
                       'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
                       'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',
                       'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
                       'WEEKDAY_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
                       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
                       'LIVE_CITY_NOT_WORK_CITY', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
                       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
                       'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']]

# Keep only numeric columns
selected_columns_numeric = selected_columns.select_dtypes(include=[np.number])

# Calculate VIF for each numeric independent variable
vif_data = pd.DataFrame()
vif_data['Feature'] = selected_columns_numeric.columns
vif_data['VIF'] = [variance_inflation_factor(selected_columns_numeric.values, i)
                   for i in range(selected_columns_numeric.shape[1])]

# Print VIF values
vif_data_sorted = vif_data.sort_values(by='VIF', ascending=False)
print(vif_data_sorted)


# In[19]:

# Remove features with high VIF
high_vif_remove = ['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'DAYS_EMPLOYED', 'FLAG_CONT_MOBILE',
                   'REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT', 'CNT_FAM_MEMBERS', 'DAYS_BIRTH']
df = df.drop(high_vif_remove, axis=1)


# In[20]:

# Feature engineering
# Determine the riskiness of the loans
df['LTV_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
df['LTI_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']


# We have an imbalanced dataset; let's try some of these strategies to make sure the imbalance does not
# distort the model's predictions.
#
# 1. Undersampling: randomly remove instances from the majority class (in this case, non-fraudulent clients)
#    to balance the dataset. This method can lead to a loss of information, as potentially useful data is
#    discarded.
#
# 2. Oversampling: randomly duplicate instances from the minority class (in this case, fraudulent clients)
#    to balance the dataset. This method can lead to overfitting, as it creates copies of the same data
#    points. (A SMOTE-based alternative, which synthesizes new samples instead of copying them, is sketched
#    after the oversampling cell below.)

# In[21]:

# Separate majority and minority classes
df_majority = df[df.TARGET == 0]
df_minority = df[df.TARGET == 1]

# Downsample the majority class
df_majority_downsampled = resample(df_majority, replace=False, n_samples=len(df_minority), random_state=42)

# Combine the downsampled majority class and the minority class
df_undersampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
print(df_undersampled.TARGET.value_counts())


# In[22]:

# Oversampling
# Upsample the minority class
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)

# Combine the upsampled minority class and the majority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
print(df_upsampled.TARGET.value_counts())
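
# A minimal optional sketch (not used in the original pipeline): SMOTE is imported at the top but never
# applied. Instead of duplicating minority rows, it synthesizes new minority samples by interpolating
# between neighbours, which mitigates the overfitting risk noted above. It requires purely numeric,
# non-missing features, so the categorical columns are one-hot encoded first (the preprocessing above
# already imputed or dropped the missing values). In practice SMOTE should be fit on the training split
# only, so that synthetic rows never leak into the test set.

df_encoded = pd.get_dummies(df, columns=df.select_dtypes(include=['object']).columns)
X_sm = df_encoded.drop('TARGET', axis=1)
y_sm = df_encoded['TARGET']
X_res, y_res = SMOTE(random_state=42).fit_resample(X_sm, y_sm)
print(pd.Series(y_res).value_counts())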

# In[23]:

# Separate the features and the target variable
X = df_undersampled.drop('TARGET', axis=1)
y = df_undersampled['TARGET']

# Get the columns that contain categorical variables
cat_cols = X.select_dtypes(include=['object']).columns

# Apply one-hot encoding to the categorical variables
X = pd.get_dummies(X, columns=cat_cols)

# Standardize the features
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Create a PCA object and fit it to the standardized features
pca = PCA()
pca.fit(X_std)


# In[24]:

# Plot the explained variance ratio
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_.cumsum())
plt.title('Cumulative Explained Variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.show()


# In[25]:

# PCA for Undersampled Data

# Select only the numeric columns (excluding the target, which should not be part of the feature space)
X_numeric = df_undersampled.drop('TARGET', axis=1).select_dtypes(include=[np.number])

# Standardize the features
scaler = StandardScaler()
X_std = scaler.fit_transform(X_numeric)

# Create a PCA object and fit it to the standardized features
pca = PCA()
pca.fit(X_std)

# Plot the explained variance ratio
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_.cumsum())
plt.title('Cumulative Explained Variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.show()


# In[26]:

# PCA for Oversampled Data

# Get the columns that contain categorical variables
cat_cols = df_upsampled.select_dtypes(include=['object']).columns

# Apply one-hot encoding to the categorical variables
df_upsampled = pd.get_dummies(df_upsampled, columns=cat_cols)

# Separate the features and the target variable
X = df_upsampled.drop('TARGET', axis=1)
y = df_upsampled['TARGET']

# Standardize the features
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Create a PCA object and fit it to the standardized features
pca = PCA()
pca.fit(X_std)

# Plot the explained variance ratio
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_.cumsum())
plt.title('Cumulative Explained Variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.show()
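
# The PCA fits above are only used to inspect explained variance; the components are never applied
# downstream. A minimal sketch of how the reduction could actually be used (the 95% variance threshold
# is an illustrative choice, not from the original analysis): the projected features X_pca could then
# replace X when training the model below.

pca_95 = PCA(n_components=0.95)
X_pca = pca_95.fit_transform(X_std)
print(f'Reduced from {X_std.shape[1]} to {X_pca.shape[1]} components')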

# In[27]:

# Model training

# Split the data into training and temporary sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the temporary set into validation and testing sets
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train an XGBoost model
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)

# Evaluate the model on the testing data
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
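
# A small optional addition (not in the original evaluation): roc_auc_score and confusion_matrix are
# imported at the top but never used. For an imbalanced problem, the ROC AUC on predicted probabilities
# and the raw confusion matrix are useful complements to the classification report.

y_proba = clf.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, y_proba))
print(confusion_matrix(y_test, y_pred))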

# In[28]:

# Define an XGBoost classifier with default hyperparameters
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', early_stopping_rounds=10)

# Define the range of hyperparameters to search over
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [100, 500, 1000],
    'gamma': [0, 0.1, 1]
}


# I will now tune the hyperparameters. PLEASE DO NOT RUN THE CELL BELOW: grid search is a very
# computationally expensive algorithm and will take a long time to execute. I have pasted its output instead.

# In[29]:

#grid_search = GridSearchCV(xgb_clf, param_grid=param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
#grid_search.fit(X_train, y_train,
#                eval_set=[(X_valid, y_valid)])

#print("Best parameters: ", grid_search.best_params_)
#print("Best F1 score: ", grid_search.best_score_)


# In[30]:

# Best parameters: {'gamma': 0, 'learning_rate': 1, 'max_depth': 9, 'n_estimators': 1000}
# Best F1 score: 0.9677071291204464


# In[31]:

# Save the trained model as a serialized file
# Note: because the grid search above is commented out, xgb_clf was never fitted,
# so we serialize clf, the baseline model trained earlier.
with open('model.pkl', 'wb') as file:
    pickle.dump(clf, file)
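
# A quick optional round-trip check (not in the original notebook): load the serialized model back
# and confirm it still produces predictions on a few test rows.

with open('model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

print(loaded_model.predict(X_test[:5]))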