#!/usr/bin/env python
# coding: utf-8
#
Case Study - Scotia
# # Overview & Cleaning
# In[101]:
import pandas as pd
import numpy as np
# In[102]:
# loading the df
df = pd.read_excel(r'C:\Users\tulio\OneDrive\JupyterLab\case_scotia\Study Case.xlsx', sheet_name='BankChurners')
# |Field |Description|
# |:-------------|----- |
# |CLIENTNUM |Customer number |
# |Attrition_Flag |Flag to indicate customer attrition |
# |Customer_Age |Customer Age |
# |Gender |Gender |
# |Dependent_count |Number of customer dependants |
# |Education_Level |Education level |
# |Marital_Status |Marital Status |
# |Income_Category |Customer Income Category |
# |Card_Category |Type of credit Card |
# |Months_on_book |Time (months) with the bank |
# |Total_Relationship_Count |Number of accounts/products the customers has with the bank |
# |Months_Inactive_12_mon |Months of inactivity in the last 12 months |
# |Contacts_Count_12_mon |Marketing Campaigns where the customer was reached out |
# |Credit_Limit |Limit of Credit on the Credit card |
# |Total_Revolving_Bal |The balance that carries over from one month to the next on the credit card |
# |Avg_Open_To_Buy |Avg Credit Amount Available on the Credit Card |
# |Total_Amt_Chng_Q4_Q1 |Quarter over Quarter change in the amounts of charges in the credit card |
# |Total_Trans_Amt |Total Amount of transactions |
# |Total_Trans_Ct |Number of transactions |
# |Total_Ct_Chng_Q4_Q1 |Quarter over Quarter transactions count variation |
# |Avg_Utilization_Ratio |It's how much the customers owe divided by their credit limit |
#
# In[103]:
pd.set_option('display.max_columns', None)
df.head() # viewing the header and first lines
# In[104]:
# to understand general dataframe information
df.info()
df.shape
# In[105]:
# understanding what (unique) values we have in columns that appear to be categorical
# K means reading elements in the order they occur in memory.
# ravel returns an adjacent flat 1-D array containing the elements
pd.unique(df[['Attrition_Flag', 'Gender', 'Education_Level',
'Marital_Status', 'Income_Category', 'Card_Category']].values.ravel('K'))
# In[106]:
sum(df.duplicated(keep='first')) # return sum of duplicate rows
# In[107]:
df['CLIENTNUM'].is_unique # equals true in case of no duplicates
# In[108]:
# checking for negative values
s = df.select_dtypes(np.number).lt(0).any()\
.reindex(df.columns, fill_value=False)\
.rename_axis("col").reset_index(name='isnegative')
print(s)
# ### Notes for treatment
# - No missing values;
# - No negative values;
# - No duplicate lines;
# - Convert datatype of *Credit limit* and *Avg_Open_To_Buy* to int, to save memory;
# - Convert datatype of Attrition_Flag to bool, to save memory;
# - No need for CLIENTNUM, an index will do the job as there are no duplicates;
# - No need for 'Avg_Open_To_Buy', as Credit_Limit, Total_Revolving_Bal and Avg_Utilization_Ratio are enough and more useful (also, taking it out will avoid multicolinearity);
# - Change 'Total_Revolving_Bal' to 'Avg_Total_Revolving_Bal' - so we know what we are talking about (It's an average, since Avg_Open_To_Buy and Avg_Utilization_Ratio).
# In[109]:
# Convert datatype of Credit limit and Avg_Open_To_Buy to int, to save memory;
cols = ['Credit_Limit', 'Avg_Open_To_Buy'] + df.select_dtypes(include=np.int64).columns.tolist()
df[cols] = df[cols].round(0).astype('int')
# In[110]:
# convert flags datatype to bool
df['Attrition_Flag'] = df['Attrition_Flag'].map({'Existing Customer': 0, 'Attrited Customer': 1})
#df['Attrition_Flag'] = df['Attrition_Flag'].astype('bool')
# In[111]:
# drop and rename columns
df = df.drop(['Avg_Open_To_Buy'], axis=1)
df.rename(columns={'Total_Revolving_Bal': 'Avg_Total_Revolving_Bal'}, inplace=True)
# ### New features
# In[112]:
# applying calculations and creating columns
df['Purchase_Weight'] = (df['Total_Trans_Amt'] / df['Total_Trans_Ct']).round(3)
df['Increased_Amt_Chng_Q4_Q1'] = [1 if x>1 else 0 for x in df['Total_Amt_Chng_Q4_Q1']]
df['Increased_Ct_Chng_Q4_Q1'] = [1 if x>1 else 0 for x in df['Total_Ct_Chng_Q4_Q1']]
# In[113]:
# adjust order of columns
df = df[['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Avg_Total_Revolving_Bal', 'Avg_Utilization_Ratio', 'Total_Trans_Amt', 'Total_Amt_Chng_Q4_Q1', 'Increased_Amt_Chng_Q4_Q1', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Increased_Ct_Chng_Q4_Q1', 'Purchase_Weight']]
# In[114]:
df.info()
# In[115]:
df.head()
# **Saving the file for exploratory analysis**
# In[116]:
df.to_csv("df_cleaned.csv", index=False)