The objective of this notebook is to present an extensive analysis of the IBM Customer Churn Dataset and to predict the customer churn rate.
Dataset Source :
GitHub Project Repository :
NB: This project also serves as my assignments for the courses below -
You can also view this notebook on Kaggle.
!pip install --upgrade scipy # the latest version of scipy is needed to calculate Cramer's V
"""Import basic modules."""
import math
import os
import gc
import random
import pprint
import numpy as np # For linear algebra
import pandas as pd # For data manipulation
import matplotlib.pyplot as plt # For 2D visualization
import seaborn as sns
# Warning Libraries
import warnings
warnings.filterwarnings("ignore")
# warnings.simplefilter(action='ignore', category=FutureWarning)
from collections import Counter
from scipy import stats # For statistics
from scipy.stats.contingency import association # upgrade scipy to use this to calculate Cramer's V
"""Plotly visualization"""
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.preprocessing import PowerTransformer # convert to Gaussian-like data
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, RepeatedStratifiedKFold, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Algorithms
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
# Boosting Algorithms
!pip install catboost
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
!pip install optuna
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
import multiprocessing
import pickle, joblib
from sklearn.metrics import matthews_corrcoef, roc_auc_score, precision_recall_curve, confusion_matrix, classification_report, roc_curve, auc
from sklearn.utils import resample
from IPython.display import Markdown, display
# utility function to print markdown string
def printmd(string):
    display(Markdown(string))
# customize as needed
plt_params = {
# 'figure.facecolor': 'white',
'axes.facecolor' : 'white',
## to set size
# 'legend.fontsize': 'x-large',
# 'figure.figsize': (15, 10),
# 'axes.labelsize': 'x-large',
# 'axes.titlesize': 'x-large',
# 'xtick.labelsize': 'x-large',
# 'ytick.labelsize': 'x-large'
}
plt.rcParams.update(plt_params)
sns.set_style('whitegrid')
# init_notebook_mode(connected=True)
# pio.renderers.default='notebook' # to display plotly graph
%matplotlib inline
!pip freeze | grep optuna
!pip freeze | grep xgboost
!pip freeze | grep catboost
!pip freeze | grep lightgbm
!pip freeze | grep plotly
!pip freeze | grep scipy
!pip freeze | grep scikit-learn
# padding value
left_padding = 21
# seed value
SEED = 42
# set optuna verbosity level
optuna_verbosity = optuna.logging.WARNING # https://optuna.readthedocs.io/en/latest/reference/logging.html#module-optuna.logging
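# apply the chosen verbosity (assumption: the notebook relies on this level when the Optuna study runs later)
optuna.logging.set_verbosity(optuna_verbosity)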
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(SEED)
Feature Name | Description | Data Type |
---|---|---|
customerID | Contains customer ID | categorical |
gender | Whether the customer is female or male | categorical |
SeniorCitizen | Whether the customer is a senior citizen or not (1, 0) | numeric, int |
Partner | Whether the customer has a partner or not (Yes, No) | categorical |
Dependents | Whether the customer has dependents or not (Yes, No) | categorical |
tenure | Number of months the customer has stayed with the company | numeric, int |
PhoneService | Whether the customer has a phone service or not (Yes, No) | categorical |
MultipleLines | Whether the customer has multiple lines or not (Yes, No, No phone service) | categorical |
InternetService | Customer’s internet service provider (DSL, Fiber optic, No) | categorical |
OnlineSecurity | Whether the customer has online security or not (Yes, No, No internet service) | categorical |
OnlineBackup | Whether the customer has online backup or not (Yes, No, No internet service) | categorical |
DeviceProtection | Whether the customer has device protection or not (Yes, No, No internet service) | categorical |
TechSupport | Whether the customer has tech support or not (Yes, No, No internet service) | categorical |
StreamingTV | Whether the customer has streaming TV or not (Yes, No, No internet service) | categorical |
StreamingMovies | Whether the customer has streaming movies or not (Yes, No, No internet service) | categorical |
Contract | The contract term of the customer (Month-to-month, One year, Two year) | categorical |
PaperlessBilling | Whether the customer has paperless billing or not (Yes, No) | categorical |
PaymentMethod | The customer’s payment method (Electronic check, Mailed check, Bank transfer, Credit card) | categorical |
MonthlyCharges | The amount charged to the customer monthly | numeric, float |
TotalCharges | The total amount charged to the customer | object |
Churn | Whether the customer churned or not (Yes or No) | categorical |
df_churn = pd.read_csv("https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv")
df_churn.head()
print(f"Dataset Dimension: {df_churn.shape[0]} rows, {df_churn.shape[1]} columns")
df_churn.info()
printmd("<br>**SeniorCitizen** is already converted to ineteger<br><br>**TotalCharges** should be converted to float")
Drop customerID column
del df_churn["customerID"]
df_churn.describe(include=['object']).T
print('Known observations: {}\nUnique observations: {}'.format(len(df_churn.index),len(df_churn.drop_duplicates().index)))
printmd("**No duplicates Found!**")
printmd("**Unique Values By Features**")
for feature in df_churn.columns:
    uniq = np.unique(df_churn[feature])
    print(feature.ljust(left_padding), len(uniq))
df_churn.isna().sum()
cat_cols = set(df_churn.columns) - set(df_churn._get_numeric_data().columns)
printmd("'**isna**' is only applicable for numerical data type<br>")
printmd("Checking missing values for object data type<br><br>")
for cat in cat_cols:
    print(cat.ljust(left_padding), df_churn[cat].apply(lambda x: len(x.strip()) == 0 or x.strip().lower() == 'nan').sum())
printmd("<br>TotalCharges is an object datatype, it has **11** 'nan' value")
Convert TotalCharges to numeric
df_churn["TotalCharges"] = pd.to_numeric(df_churn["TotalCharges"], errors = 'coerce')
indices_null_tc = df_churn[df_churn["TotalCharges"].isna()].index
display(df_churn.iloc[indices_null_tc])
printmd("<br>**'Tenure' (months stayed at the company) is correlated with 'TotalCharges' column**")
printmd("**when 'Tenure' is 0 , 'TotalCharges' is 0 too**")
display(df_churn[df_churn.tenure == 1].head(2))
printmd("<br>**'TotalCharges' is the same as 'MonthlyCharges' when 'Tenure' is not 0**")
display(df_churn[df_churn.tenure == 3].head(2))
printmd("<br>**'TotalCharges' increases with respect to 'MonthlyCharges' and 'Tenure'**")
printmd("<br>From the above observation we can conclude that, **'TotalCharges' = 'MonthlyCharges' x 'Tenure' + Extra Cost**")
printmd("**Therefore, imputing missing values on 'TotalCharges' column with 0**")
df_churn['TotalCharges'].fillna(0, inplace=True)
df_churn[['tenure', 'MonthlyCharges', 'TotalCharges']].describe().T
There are three numerical features which can be ranked based on their values :
We can bin them into three levels : Low, Medium and High
def binning_feature(feature):
    plt.hist(df_churn[feature])
    # set x/y labels and plot title
    plt.xlabel(f"{feature.title()}")
    plt.ylabel("Count")
    plt.title(f"{feature.title()} Bins")
    plt.show()
    bins = np.linspace(min(df_churn[feature]), max(df_churn[feature]), 4)
    printmd("**Value Range**")
    printmd(f"Low ({bins[0]: .2f} - {bins[1]: .2f})")
    printmd(f"Medium ({bins[1]: .2f} - {bins[2]: .2f})")
    printmd(f"High ({bins[2]: .2f} - {bins[3]: .2f})")
    group_names = ['Low', 'Medium', 'High']
    df_churn.insert(df_churn.shape[1]-1, f'{feature}-binned', pd.cut(df_churn[feature], bins, labels=group_names, include_lowest=True))
    display(df_churn[[feature, f'{feature}-binned']].head(10))
    # count values
    printmd("<br>**Binning Distribution**<br>")
    display(df_churn[f'{feature}-binned'].value_counts())
    # plot the distribution of each bin, keeping the bar order aligned with the bin labels
    plt.bar(group_names, df_churn[f'{feature}-binned'].value_counts().reindex(group_names))
    # px.bar(data_canada, x='year', y='pop')
    # set x/y labels and plot title
    plt.xlabel(f"{feature.title()}")
    plt.ylabel("Count")
    plt.title(f"{feature.title()} Bins")
    plt.show()
binning_feature('tenure')
binning_feature('MonthlyCharges')
binning_feature('TotalCharges')
Data Types Distribution after cleaning
printmd("**Data Types**<br>")
df_churn.dtypes.value_counts()
Normality tests are used to determine whether a sample is likely to have been drawn from a normally distributed population.
Popular normality tests - D’Agostino’s K², Shapiro-Wilk, Anderson-Darling.
There are three numerical features in this dataset - MonthlyCharges, Tenure, and TotalCharges.
Hypotheses -
NB : we can not rely on the Shapiro-Wilk test here because the sample size is > 5000, and for this test the p-value may not be accurate for N > 5000
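As a rough cross-check, the Shapiro-Wilk test can still be run on a random subsample of at most 5000 rows; the column and subsample size below are illustrative choices.
shapiro_sample = df_churn['MonthlyCharges'].sample(n=5000, random_state=SEED)
shapiro_stat, shapiro_p = stats.shapiro(shapiro_sample)
print('Shapiro-Wilk (5000-row subsample): Statistics=%.5f, p=%.5f' % (shapiro_stat, shapiro_p))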
stat, p = stats.normaltest(df_churn['MonthlyCharges'])
print('Statistics=%.5f, p=%.3f' % (stat, p))
# interpret
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')
stat, p = stats.normaltest(df_churn['tenure'])
print('Statistics=%.5f, p=%.3f' % (stat, p))
# interpret
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')
Hypotheses -
Critical values in a statistical test are pre-defined significance boundaries: if the calculated statistic is less than the critical value, we fail to reject H0. Rather than a single p-value, the Anderson-Darling test returns critical values for a range of commonly used significance levels - in this case (normal/exponential): 15%, 10%, 5%, 2.5%, 1%
result = stats.anderson(df_churn['TotalCharges'])
print('Statistic: %.3f' % result.statistic)
for i in range(len(result.critical_values)):
    sl, cv = result.significance_level[i], result.critical_values[i]
    if result.statistic < result.critical_values[i]:
        print(f'Significance level {sl:.2f} % : critical value {cv:.3f}, data looks normal (fail to reject H0)')
    else:
        print(f'Significance level {sl:.2f} % : critical value {cv:.3f}, data does not look normal (reject H0)')
fig = px.pie(df_churn['Churn'].value_counts().reset_index().rename(columns={'index':'Type'}), values='Churn', names='Type', title='Churn (Target) Distribution')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
printmd("### Target distribution is Imbalanced")
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=2, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}],[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=df_churn['OnlineSecurity'].value_counts().index, values=df_churn['OnlineSecurity'].value_counts().values, name="Online Security"),
1, 1)
fig.add_trace(go.Pie(labels=df_churn['OnlineBackup'].value_counts().index, values=df_churn['OnlineBackup'].value_counts().values, name="Online Backup"),
1, 2)
fig.add_trace(go.Pie(labels=df_churn['DeviceProtection'].value_counts().index, values=df_churn['DeviceProtection'].value_counts().values, name="Device Protection"),
2, 1)
fig.add_trace(go.Pie(labels=df_churn['TechSupport'].value_counts().index, values=df_churn['TechSupport'].value_counts().values, name="Tech Support"),
2, 2)
# donut-like pie chart
fig.update_traces(hole=.5, hoverinfo="label+percent")
fig.update_layout(
# Add annotations in the center of the donut pies.
annotations=[dict(text='Online<br>Security', x=0.195, y=0.85, font_size=20, showarrow=False),
dict(text='Online<br>Backup', x=0.805, y=0.84, font_size=20, showarrow=False),
dict(text='Device<br>Protection', x=0.185, y=0.18, font_size=20, showarrow=False),
dict(text='Tech<br>Support', x=0.805, y=0.18, font_size=20, showarrow=False)])
fig.update_layout(margin=dict(t=0, b=0, l=0, r=0))
fig.show()
printmd("### 'Online Backup', 'Device Protection' and 'Online Security', 'Tech Support' has similar distribution")
display(px.pie(df_churn['PaymentMethod'].value_counts().reset_index().rename(columns={'index':'Type'}), values='PaymentMethod', names='Type', title='Payment Method Distribution'))
printmd("#### Most of the customers use E-check")
sns.catplot(x="gender", kind="count", data=df_churn)
plt.show()
printmd("#### Approximately 50/50 gender ratio")
sns.catplot(x="Dependents", kind="count", data=df_churn)
plt.show()
printmd("#### Users who have non-dependents are approximately two times more than users having dependents")
sns.catplot(x="SeniorCitizen", kind="count", data=df_churn)
plt.show()
printmd("#### The majority of the users are not Senior Citizen")
sns.catplot(x="Contract", kind="count", data=df_churn)
plt.show()
printmd("#### Most of the users prefer Month-to-month contract")
sns.catplot(x="PaperlessBilling", kind="count", data=df_churn)
plt.show()
printmd("#### Most of the users prefer paperless billing")
sns.boxplot(x=df_churn["TotalCharges"])
plt.show()
printmd("#### The total charges fall under 4000 for majority of the users")
"""#1.Create a function to plot histogram and density plot."""
def plot_histogram(feature):
    """Plots histogram and density plot of a variable."""
    # Create subplot object
    fig = make_subplots(
        rows=2,
        cols=1,
        print_grid=False,
        subplot_titles=(f"Distribution of {feature.name} with Histogram", f"Distribution of {feature.name} with Density Plot"))
    # This is a count histogram
    fig.add_trace(
        go.Histogram(
            x=feature,
            hoverinfo="x+y"
        ),
        row=1, col=1)
    # This is a density histogram
    fig.add_trace(
        go.Histogram(
            x=feature,
            hoverinfo="x+y",
            histnorm="density"
        ),
        row=2, col=1)
    # Update layout
    fig.layout.update(
        height=800,
        width=870,
        hovermode="closest"
    )
    # Update axes
    fig.layout.yaxis1.update(title="<b>Abs Frequency</b>")
    fig.layout.yaxis2.update(title="<b>Density(%)</b>")
    fig.layout.xaxis2.update(title=f"<b>{feature.name}</b>")
    return fig.show()
plot_histogram(df_churn['tenure'])
printmd("**Tenure is U-shaped distributed**")
plot_histogram(df_churn['MonthlyCharges'])
printmd("**MonthlyCharges is heavily skewed**")
plot_histogram(df_churn['TotalCharges'])
printmd("**TotalCharges is reversed J-shaped distributed**")
In this section, I perform an extensive statistical analysis, choosing hypothesis tests according to the data types of each variable pair, like -
General Hypotheses -
# Check cardinality of categorical variables
target_col_filter = df_churn.loc[:, df_churn.columns != 'Churn']
cat_cols = list(set(target_col_filter.columns) - set(target_col_filter._get_numeric_data().columns))
num_cols = list(set(target_col_filter._get_numeric_data().columns) - set({'SeniorCitizen'})) # already converted to integer
# Get number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: target_col_filter[col].nunique(), cat_cols))
dict_features_by_col = dict(zip(cat_cols, object_nunique))
# Print number of unique entries by column, in ascending order
print(sorted(dict_features_by_col.items(), key=lambda x: x[1]))
ordinal_cols = ['tenure-binned', 'MonthlyCharges-binned', 'TotalCharges-binned']
dichotomous_cols = [cat for cat in cat_cols if df_churn[cat].value_counts().count() == 2]
polytomous_cols = list(set(cat_cols) - set(dichotomous_cols) - set(ordinal_cols))
print("Categorical Columns".ljust(left_padding), cat_cols)
print("Numerical Columns".ljust(left_padding), num_cols)
print("Ordinal Columns".ljust(left_padding), ordinal_cols)
print("Dichotomous Columns".ljust(left_padding), dichotomous_cols)
print("Polytomous Columns".ljust(left_padding), polytomous_cols)
Categorical Columns
'TechSupport', 'DeviceProtection', 'Contract', 'PaperlessBilling', 'TotalCharges-binned',
'gender', 'OnlineBackup', 'InternetService', 'StreamingTV', 'tenure-binned',
'Dependents', 'PhoneService', 'StreamingMovies', 'MultipleLines', 'Partner',
'MonthlyCharges-binned', 'OnlineSecurity', 'PaymentMethod'
Numerical Columns
'MonthlyCharges', 'tenure', 'TotalCharges'
Ordinal Columns
'tenure-binned', 'MonthlyCharges-binned', 'TotalCharges-binned'
Dichotomous Columns
'PaperlessBilling', 'gender', 'Dependents', 'PhoneService', 'Partner'
Polytomous Columns
'TechSupport', 'StreamingMovies', 'DeviceProtection', 'MultipleLines', 'Contract',
'InternetService', 'OnlineSecurity', 'StreamingTV', 'PaymentMethod', 'OnlineBackup'
AKA Spearman's rho or the Spearman correlation coefficient. Applied to continuous or ordinal variables
Unlike the Pearson correlation, the Spearman correlation does not assume that both datasets are normally distributed
Pearson correlation assumes normality, linearity and homoscedasticity. Pearson's correlation is also not able to tell the difference between dependent and independent variables
Hypotheses -
For Pearson r correlation, both variables should be normally distributed
According to the normality tests, the tenure, MonthlyCharges and TotalCharges columns are not normally distributed
def cal_spearmanr(c1, c2):
    alpha = 0.05
    correlation, p_value = stats.spearmanr(df_churn[c1], df_churn[c2])
    print(f'{c1}, {c2} correlation : {correlation}, p : {p_value}')
    if p_value > alpha:
        print('Probably do not have monotonic relationship (fail to reject H0)')
    else:
        print('Probably have monotonic relationship (reject H0)')
cal_spearmanr('tenure','MonthlyCharges')
cal_spearmanr('tenure','TotalCharges')
cal_spearmanr('MonthlyCharges','TotalCharges')
AKA Kendall's τ or Kendall's Tau
Kendall’s Tau is often used for correlation on continuous data if there are outliers in the data
A τ test is a non-parametric hypothesis test for statistical dependence based on the τ coefficient
Variables should be continuous or ordinal
Ordinal variable examples -
In this dataset there are three ordinal features :
def kendall_rank_correlation(feature1, feature2):
    coef, p_value = stats.kendalltau(df_churn[feature1], df_churn[feature2])
    print(f"Correlation between {feature1} and {feature2}")
    print('Kendall correlation coefficient = %.5f, p = %.5f' % (coef, p_value))
    # interpret the significance
    alpha = 0.05
    if p_value > alpha:
        print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p_value)
    else:
        print('Samples are correlated (reject H0) p=%.3f' % p_value)
    print('----\n')
ordinal_features = ['tenure-binned','MonthlyCharges-binned', 'TotalCharges-binned']
for ord in ordinal_features:
    printmd(f"Correlation with **{ord}**")
    kendall_rank_correlation('tenure', ord)
    kendall_rank_correlation('MonthlyCharges', ord)
    kendall_rank_correlation('TotalCharges', ord)
The Mann-Whitney U test is a nonparametric statistical significance test for determining whether two independent samples were drawn from a population with the same distribution.
The test determines whether the distributions (and hence the medians) of two groups are different.
NB : For the test to be effective, it requires at least 20 observations in each data sample.
or
def mannwhitneyu_correlation(feature1):
    # compare the feature's distribution between churned and non-churned customers
    churned = df_churn.loc[df_churn['Churn'] == 'Yes', feature1]
    not_churned = df_churn.loc[df_churn['Churn'] == 'No', feature1]
    stat, p_value = stats.mannwhitneyu(churned, not_churned)
    print(f"Correlation between {feature1} and Churn")
    print('Statistics = %.5f, p = %.5f' % (stat, p_value))
    # interpret the significance
    alpha = 0.05
    if p_value > alpha:
        print('Same distribution (fail to reject H0)')
    else:
        print('Different distribution (reject H0)')
    print('----\n')
numerical_features = ['tenure','MonthlyCharges', 'TotalCharges']
for num in numerical_features:
    printmd(f"Correlation with **{num}**")
    mannwhitneyu_correlation(num)
Point-Biserial Correlation
The point biserial correlation is used to measure the relationship between a binary variable, x, and a continuous variable, y. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply a determinative relationship.
NB: point-biserial correlation is conducted with the Pearson correlation formula except that one of the variables is dichotomous
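For reference, scipy implements this as stats.pointbiserialr; a minimal sketch with the churn flag coded as 0/1 (illustrative only, since the normality check below fails for the numeric columns).
churn_flag = (df_churn['Churn'] == 'Yes').astype(int)
r_pb, p_pb = stats.pointbiserialr(churn_flag, df_churn['MonthlyCharges'])
print('Point-biserial r = %.5f, p = %.5f' % (r_pb, p_pb))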
The assumptions for Point-Biserial correlation include:
Options to normalize a non-normal distribution -
# https://stackoverflow.com/questions/53624804/how-to-normalize-a-non-normal-distribution
data = df_churn['MonthlyCharges'].to_numpy()
pt = PowerTransformer(method='yeo-johnson') # ‘box-cox’, 'yeo-johnson'
data = data.reshape(-1, 1)
pt.fit(data)
transformed_data = pt.transform(data)
transformed_k2, transformed_p = stats.normaltest(transformed_data)
# other methods to transform into gaussian distribution
# stats.normaltest(np.log(df_churn['MonthlyCharges']))
# stats.normaltest(np.sqrt(df_churn['MonthlyCharges'])
# stats.normaltest(stats.boxcox(df_churn['MonthlyCharges'])[0])
# all other methods failed to convert into gaussian
alpha = 0.05
if transformed_p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')
Since the numerical columns cannot be converted to a Gaussian distribution, the point-biserial correlation test cannot be performed
For a dichotomous categorical variable and a continuous variable we can calculate a Pearson correlation if the categorical variable has a 0/1-coding for the categories. This correlation is then also known as a point-biserial correlation coefficient. (parametric test)
But when we have more than two categories for the categorical variable, the Pearson correlation is not appropriate anymore. We should then use eta-squared, or eta (the correlation ratio), as an effect-size measure for the relationship between a categorical variable and a continuous variable.
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0, cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator / denominator)
    return eta
correlation_ratio(df_churn['PaymentMethod'], df_churn['MonthlyCharges'])
correlation_ratio(df_churn['PaymentMethod'], df_churn['TotalCharges'])
correlation_ratio(df_churn['PaymentMethod'], df_churn['tenure'])
In classification, when both variables are categorical, the strength of the relationship between them can be measured using a Chi-square test
printmd("**Correlation Between Dichotomous Features with Target : Churn**")
for col in dichotomous_cols:
    # common 0/1 coding for both binary variables so that MCC equals the phi coefficient
    print(col.ljust(left_padding), matthews_corrcoef(df_churn[col].astype('category').cat.codes,
                                                     df_churn['Churn'].astype('category').cat.codes))
The Chi-square test detects independence between 2 categorical variables, for 2x2 or 2xMany contingency tables
The test statistic is interpreted against the chi-squared distribution with the requisite number of degrees of freedom
In terms of a p-value and a chosen significance level (alpha):
# alpha/significance = 0.05
# If p-value <= alpha: significant result, reject null hypothesis (H0), dependent
# If p-value > alpha: not significant result, fail to reject null hypothesis (H0), independent
def calculate_chi_square(feature1, feature2='Churn'):
    printmd(f"Correlation between **{feature1}** and **{feature2}**")
    crosstab = pd.crosstab(df_churn[feature1], df_churn[feature2])
    # display(crosstab)
    stat, p, dof, expected = stats.chi2_contingency(crosstab, correction=True)
    print(f'p-value : {p}, degree of freedom: {dof}')
    # print("expected frequencies :\n", expected)
    # interpret test-statistic
    prob = 0.95
    critical = stats.chi2.ppf(prob, dof)
    print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
    if abs(stat) >= critical:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')
    # interpret p-value
    alpha = 1.0 - prob
    print('significance=%.3f, p=%.3f' % (alpha, p))
    if p <= alpha:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')
    print('-----------------------------------\n')
# credit : https://machinelearningmastery.com/chi-squared-test-for-machine-learning
printmd("**Chi-Squre Correlation Between Dichotomous Features with Target : Churn**")
for col in dichotomous_cols:
    calculate_chi_square(col)
At the 5% significance level, the 'PhoneService' and 'gender' features are not dependent on the target : Churn
printmd("**Chi-Squre Correlation Between Polytomous Features with Target : Churn**")
for col in polytomous_cols:
    calculate_chi_square(col)
At the 5% significance level, all polytomous features are dependent on the target : Churn
It is based on a nominal variation of Pearson's Chi-Square Test. Like correlation, Cramer's V is symmetrical: it is insensitive to swapping x and y
Cramer's V is used to examine the association between two categorical variables when the contingency table is larger than 2 x 2 (e.g., 2 x 3). In these more complicated designs, the phi correlation test is not appropriate, but Cramer's statistic is. Cramer's V represents the association or correlation between two variables. This statistic is also referred to as "Cramer's Phi"
To know more about this, visit this article : the-search-for-categorical-correlation
def cramers_v(x, y):
    """Calculate Cramer's V statistic for categorical-categorical association.
    Uses correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    confusion_matrix = pd.crosstab(x, y)
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
# credit : https://stackoverflow.com/a/46498792/11105356
printmd("**Correlation Between Polytomous Features with Target : Churn**")
cramer_v_val_dict = {}
for col in polytomous_cols:
    cramer_v_val_dict[col] = cramers_v(df_churn[col], df_churn['Churn'])
cramer_v_val_dict_sorted = sorted(cramer_v_val_dict.items(), key=lambda x: x[1], reverse=True)
for k, v in cramer_v_val_dict_sorted:
    print(k.ljust(left_padding), v)
printmd("<br>**Contract, OnlineSecurity, TechSupport, InternetService are moderately correlated with Churn**<br>")
printmd("**Cramers V Heatmap on Polytomous Features and Target: Churn**")
cramers_v_val = pd.DataFrame(index=['Churn'], columns=polytomous_cols)
for j in range(0, len(polytomous_cols)):
    u = cramers_v(df_churn['Churn'], df_churn[polytomous_cols[j]])
    cramers_v_val.loc[:, polytomous_cols[j]] = u
cramers_v_val.fillna(value=np.nan,inplace=True)
plt.figure(figsize=(20,1))
sns.heatmap(cramers_v_val,annot=True,fmt='.3f', cmap="YlGnBu")
plt.show()
crosstab = pd.crosstab(df_churn['OnlineSecurity'], df_churn['Churn'])
display(crosstab)
printmd(f"Association between OnlineSecurity and Target:Churn **{stats.contingency.association(crosstab, method='cramer')}**")
AKA Theil’s U - an asymmetric measure of association between categorical features
It is based on the conditional entropy between x and y, or in human language: given the value of x, how many possible states does y have, and how often do they occur.
Formally marked as U(x|y); just like Cramer's V, the output value is in the range [0,1], where 0 means that feature y provides no information about feature x, and 1 means that feature y provides full information about feature x's value
Unlike Cramer's V, it is asymmetric
So, unlike symmetric tests, we do not lose information about the direction of the association
def conditional_entropy(x, y):
    # entropy of x given y
    y_counter = Counter(y)
    xy_counter = Counter(list(zip(x, y)))
    total_occurrences = sum(y_counter.values())
    entropy = 0
    for xy in xy_counter.keys():
        p_xy = xy_counter[xy] / total_occurrences
        p_y = y_counter[xy[1]] / total_occurrences
        entropy += p_xy * math.log(p_y / p_xy)
    return entropy
def theil_u(x, y):
    s_xy = conditional_entropy(x, y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = list(map(lambda n: n/total_occurrences, x_counter.values()))
    s_x = stats.entropy(p_x)
    if s_x == 0:
        return 1
    else:
        return (s_x - s_xy) / s_x
theilu = pd.DataFrame(index=['Churn'], columns=cat_cols)
for j in range(0, len(cat_cols)):
    u = theil_u(df_churn['Churn'].tolist(), df_churn[cat_cols[j]].tolist())
    theilu.loc[:, cat_cols[j]] = u
theilu.fillna(value=np.nan,inplace=True)
plt.figure(figsize=(20,1))
sns.heatmap(theilu,annot=True,fmt='.2f')
plt.show()
printmd("**Contract, OnlineSecurity, TechSupport, tenure-binned are moderately correlated with Churn**")
For categorical variables, multicollinearity can be detected with Spearman rank correlation coefficient (ordinal variables) and chi-square test (nominal variables)
For a categorical and a continuous variable, multicollinearity can be measured with a t-test (if the categorical variable has 2 categories, parametric) or ANOVA (more than 2 categories, parametric), as sketched below
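A minimal sketch of how these two parametric tests could be applied here; the column pairings below are illustrative choices.
# t-test: MonthlyCharges across the two PhoneService groups (illustrative pairing)
phone_yes = df_churn.loc[df_churn['PhoneService'] == 'Yes', 'MonthlyCharges']
phone_no = df_churn.loc[df_churn['PhoneService'] == 'No', 'MonthlyCharges']
print('t-test :', stats.ttest_ind(phone_yes, phone_no, equal_var=False))
# one-way ANOVA: MonthlyCharges across the three Contract groups (illustrative pairing)
contract_groups = [grp['MonthlyCharges'].values for _, grp in df_churn.groupby('Contract')]
print('ANOVA  :', stats.f_oneway(*contract_groups))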
Spearman's ρ was already performed, proceeding with chi-square
calculate_chi_square('PaymentMethod','MultipleLines')
calculate_chi_square('PaymentMethod','PhoneService')
calculate_chi_square('PaymentMethod','Contract')
plt.figure(figsize=(10,6),dpi=100)
sns.kdeplot(df_churn.tenure, color='b', shade=True, label='Tenure')
sns.kdeplot(df_churn.MonthlyCharges, color='r', shade=True, label='Monthly Charges')
plt.xlabel('Tenure vs Monthly Charges')
plt.ylabel('Probability Density')
plt.legend()
plt.show()
printmd("""**Both are are not normally distributed, skewed,Tenure has a
Bi-modal distribution <br>Most users stayed for less than 20 months,
Monthly Charges for most people is nearly 20 unit**""")
# https://stackoverflow.com/a/65242391/11105356
df_g = df_churn.groupby(['StreamingTV', 'Churn']).size().reset_index()
df_g['percentage'] = df_churn.groupby(['StreamingTV', 'Churn']).size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum())).values
df_g.columns = ['StreamingTV', 'Churn', 'Counts', 'Percentage']
fig = px.bar(df_g, x='StreamingTV', y='Counts',
color='Churn',
color_discrete_map={
'Yes': '#99D594',
'No': '#FC8D59',
},
text=df_g['Percentage'].apply(lambda x: '{0:1.2f}%'.format(x)))
display(fig)
printmd("**Similar ratio between streamer vs non-streamer in churned users**")