#!/usr/bin/env python # coding: utf-8 #

Case Study - Scotia

# # Overview & Cleaning # In[101]: import pandas as pd import numpy as np # In[102]: # loading the df df = pd.read_excel(r'C:\Users\tulio\OneDrive\JupyterLab\case_scotia\Study Case.xlsx', sheet_name='BankChurners') # |Field |Description| # |:-------------|----- | # |CLIENTNUM |Customer number | # |Attrition_Flag |Flag to indicate customer attrition | # |Customer_Age |Customer Age | # |Gender |Gender | # |Dependent_count |Number of customer dependants | # |Education_Level |Education level | # |Marital_Status |Marital Status | # |Income_Category |Customer Income Category | # |Card_Category |Type of credit Card | # |Months_on_book |Time (months) with the bank | # |Total_Relationship_Count |Number of accounts/products the customers has with the bank | # |Months_Inactive_12_mon |Months of inactivity in the last 12 months | # |Contacts_Count_12_mon |Marketing Campaigns where the customer was reached out | # |Credit_Limit |Limit of Credit on the Credit card | # |Total_Revolving_Bal |The balance that carries over from one month to the next on the credit card | # |Avg_Open_To_Buy |Avg Credit Amount Available on the Credit Card | # |Total_Amt_Chng_Q4_Q1 |Quarter over Quarter change in the amounts of charges in the credit card | # |Total_Trans_Amt |Total Amount of transactions | # |Total_Trans_Ct |Number of transactions | # |Total_Ct_Chng_Q4_Q1 |Quarter over Quarter transactions count variation | # |Avg_Utilization_Ratio |It's how much the customers owe divided by their credit limit | # # In[103]: pd.set_option('display.max_columns', None) df.head() # viewing the header and first lines # In[104]: # to understand general dataframe information df.info() df.shape # In[105]: # understanding what (unique) values we have in columns that appear to be categorical # K means reading elements in the order they occur in memory. # ravel returns an adjacent flat 1-D array containing the elements pd.unique(df[['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']].values.ravel('K')) # In[106]: sum(df.duplicated(keep='first')) # return sum of duplicate rows # In[107]: df['CLIENTNUM'].is_unique # equals true in case of no duplicates # In[108]: # checking for negative values s = df.select_dtypes(np.number).lt(0).any()\ .reindex(df.columns, fill_value=False)\ .rename_axis("col").reset_index(name='isnegative') print(s) # ### Notes for treatment # - No missing values; # - No negative values; # - No duplicate lines; # - Convert datatype of *Credit limit* and *Avg_Open_To_Buy* to int, to save memory; # - Convert datatype of Attrition_Flag to bool, to save memory; # - No need for CLIENTNUM, an index will do the job as there are no duplicates; # - No need for 'Avg_Open_To_Buy', as Credit_Limit, Total_Revolving_Bal and Avg_Utilization_Ratio are enough and more useful (also, taking it out will avoid multicolinearity); # - Change 'Total_Revolving_Bal' to 'Avg_Total_Revolving_Bal' - so we know what we are talking about (It's an average, since Avg_Open_To_Buy and Avg_Utilization_Ratio). # In[109]: # Convert datatype of Credit limit and Avg_Open_To_Buy to int, to save memory; cols = ['Credit_Limit', 'Avg_Open_To_Buy'] + df.select_dtypes(include=np.int64).columns.tolist() df[cols] = df[cols].round(0).astype('int') # In[110]: # convert flags datatype to bool df['Attrition_Flag'] = df['Attrition_Flag'].map({'Existing Customer': 0, 'Attrited Customer': 1}) #df['Attrition_Flag'] = df['Attrition_Flag'].astype('bool') # In[111]: # drop and rename columns df = df.drop(['Avg_Open_To_Buy'], axis=1) df.rename(columns={'Total_Revolving_Bal': 'Avg_Total_Revolving_Bal'}, inplace=True) # ### New features # In[112]: # applying calculations and creating columns df['Purchase_Weight'] = (df['Total_Trans_Amt'] / df['Total_Trans_Ct']).round(3) df['Increased_Amt_Chng_Q4_Q1'] = [1 if x>1 else 0 for x in df['Total_Amt_Chng_Q4_Q1']] df['Increased_Ct_Chng_Q4_Q1'] = [1 if x>1 else 0 for x in df['Total_Ct_Chng_Q4_Q1']] # In[113]: # adjust order of columns df = df[['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Avg_Total_Revolving_Bal', 'Avg_Utilization_Ratio', 'Total_Trans_Amt', 'Total_Amt_Chng_Q4_Q1', 'Increased_Amt_Chng_Q4_Q1', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Increased_Ct_Chng_Q4_Q1', 'Purchase_Weight']] # In[114]: df.info() # In[115]: df.head() # **Saving the file for exploratory analysis** # In[116]: df.to_csv("df_cleaned.csv", index=False)