import pandas as pd
import numpy as np
# loading the df
df = pd.read_excel(r'C:\Users\tulio\OneDrive\JupyterLab\case_scotia\Study Case.xlsx', sheet_name='BankChurners')
Field | Description |
---|---|
CLIENTNUM | Customer number |
Attrition_Flag | Flag to indicate customer attrition |
Customer_Age | Customer Age |
Gender | Gender |
Dependent_count | Number of customer dependants |
Education_Level | Education level |
Marital_Status | Marital Status |
Income_Category | Customer Income Category |
Card_Category | Type of credit Card |
Months_on_book | Time (months) with the bank |
Total_Relationship_Count | Number of accounts/products the customers has with the bank |
Months_Inactive_12_mon | Months of inactivity in the last 12 months |
Contacts_Count_12_mon | Marketing Campaigns where the customer was reached out |
Credit_Limit | Limit of Credit on the Credit card |
Total_Revolving_Bal | The balance that carries over from one month to the next on the credit card |
Avg_Open_To_Buy | Avg Credit Amount Available on the Credit Card |
Total_Amt_Chng_Q4_Q1 | Quarter over Quarter change in the amounts of charges in the credit card |
Total_Trans_Amt | Total Amount of transactions |
Total_Trans_Ct | Number of transactions |
Total_Ct_Chng_Q4_Q1 | Quarter over Quarter transactions count variation |
Avg_Utilization_Ratio | It's how much the customers owe divided by their credit limit |
pd.set_option('display.max_columns', None)
df.head() # viewing the header and first lines
CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
3 | 769911858 | Existing Customer | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# to understand general dataframe information
df.info()
df.shape
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 10127 non-null object 6 Marital_Status 10127 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
(10127, 21)
# understanding what (unique) values we have in columns that appear to be categorical
# K means reading elements in the order they occur in memory.
# ravel returns an adjacent flat 1-D array containing the elements
pd.unique(df[['Attrition_Flag', 'Gender', 'Education_Level',
'Marital_Status', 'Income_Category', 'Card_Category']].values.ravel('K'))
array(['Existing Customer', 'Attrited Customer', 'M', 'F', 'High School', 'Graduate', 'Uneducated', 'Unknown', 'College', 'Post-Graduate', 'Doctorate', 'Married', 'Single', 'Divorced', '$60K - $80K', 'Less than $40K', '$80K - $120K', '$40K - $60K', '$120K +', 'Blue', 'Gold', 'Silver', 'Platinum'], dtype=object)
sum(df.duplicated(keep='first')) # return sum of duplicate rows
0
df['CLIENTNUM'].is_unique # equals true in case of no duplicates
True
# checking for negative values
s = df.select_dtypes(np.number).lt(0).any()\
.reindex(df.columns, fill_value=False)\
.rename_axis("col").reset_index(name='isnegative')
print(s)
col isnegative 0 CLIENTNUM False 1 Attrition_Flag False 2 Customer_Age False 3 Gender False 4 Dependent_count False 5 Education_Level False 6 Marital_Status False 7 Income_Category False 8 Card_Category False 9 Months_on_book False 10 Total_Relationship_Count False 11 Months_Inactive_12_mon False 12 Contacts_Count_12_mon False 13 Credit_Limit False 14 Total_Revolving_Bal False 15 Avg_Open_To_Buy False 16 Total_Amt_Chng_Q4_Q1 False 17 Total_Trans_Amt False 18 Total_Trans_Ct False 19 Total_Ct_Chng_Q4_Q1 False 20 Avg_Utilization_Ratio False
# Convert datatype of Credit limit and Avg_Open_To_Buy to int, to save memory;
cols = ['Credit_Limit', 'Avg_Open_To_Buy'] + df.select_dtypes(include=np.int64).columns.tolist()
df[cols] = df[cols].round(0).astype('int')
# convert flags datatype to bool
df['Attrition_Flag'] = df['Attrition_Flag'].map({'Existing Customer': 0, 'Attrited Customer': 1})
#df['Attrition_Flag'] = df['Attrition_Flag'].astype('bool')
# drop and rename columns
df = df.drop(['Avg_Open_To_Buy'], axis=1)
df.rename(columns={'Total_Revolving_Bal': 'Avg_Total_Revolving_Bal'}, inplace=True)
# applying calculations and creating columns
df['Purchase_Weight'] = (df['Total_Trans_Amt'] / df['Total_Trans_Ct']).round(3)
df['Increased_Amt_Chng_Q4_Q1'] = [1 if x>1 else 0 for x in df['Total_Amt_Chng_Q4_Q1']]
df['Increased_Ct_Chng_Q4_Q1'] = [1 if x>1 else 0 for x in df['Total_Ct_Chng_Q4_Q1']]
# adjust order of columns
df = df[['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Avg_Total_Revolving_Bal', 'Avg_Utilization_Ratio', 'Total_Trans_Amt', 'Total_Amt_Chng_Q4_Q1', 'Increased_Amt_Chng_Q4_Q1', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Increased_Ct_Chng_Q4_Q1', 'Purchase_Weight']]
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int32 1 Attrition_Flag 10127 non-null int64 2 Customer_Age 10127 non-null int32 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int32 5 Education_Level 10127 non-null object 6 Marital_Status 10127 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int32 10 Total_Relationship_Count 10127 non-null int32 11 Months_Inactive_12_mon 10127 non-null int32 12 Contacts_Count_12_mon 10127 non-null int32 13 Credit_Limit 10127 non-null int32 14 Avg_Total_Revolving_Bal 10127 non-null int32 15 Avg_Utilization_Ratio 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int32 17 Total_Amt_Chng_Q4_Q1 10127 non-null float64 18 Increased_Amt_Chng_Q4_Q1 10127 non-null int64 19 Total_Trans_Ct 10127 non-null int32 20 Total_Ct_Chng_Q4_Q1 10127 non-null float64 21 Increased_Ct_Chng_Q4_Q1 10127 non-null int64 22 Purchase_Weight 10127 non-null float64 dtypes: float64(4), int32(11), int64(3), object(5) memory usage: 1.4+ MB
df.head()
CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Avg_Total_Revolving_Bal | Avg_Utilization_Ratio | Total_Trans_Amt | Total_Amt_Chng_Q4_Q1 | Increased_Amt_Chng_Q4_Q1 | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Increased_Ct_Chng_Q4_Q1 | Purchase_Weight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 768805383 | 0 | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691 | 777 | 0.061 | 1144 | 1.335 | 1 | 42 | 1.625 | 1 | 27.238 |
1 | 818770008 | 0 | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256 | 864 | 0.105 | 1291 | 1.541 | 1 | 33 | 3.714 | 1 | 39.121 |
2 | 713982108 | 0 | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418 | 0 | 0.000 | 1887 | 2.594 | 1 | 20 | 2.333 | 1 | 94.350 |
3 | 769911858 | 0 | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313 | 2517 | 0.760 | 1171 | 1.405 | 1 | 20 | 2.333 | 1 | 58.550 |
4 | 709106358 | 0 | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716 | 0 | 0.000 | 816 | 2.175 | 1 | 28 | 2.500 | 1 | 29.143 |
Saving the file for exploratory analysis
df.to_csv("df_cleaned.csv", index=False)