#!/usr/bin/env python
# coding: utf-8

# # Customer Churn Prediction
# Last Update: 08/02/2020
# PyCaret Version: 2.0
# Author: Alexandre Farias
# Email: afarias@tuta.io

# ![](https://res.cloudinary.com/dn1j6dpd7/image/fetch/f_auto,q_auto,w_736/https://www.livechat.com/wp-content/uploads/2016/04/customer-churn@2x.jpg)
# Image Source: [What Is Churn Rate and Why It Will Mess With Your Growth](https://www.livechat.com/success/churn-rate/)

# # 1. Introduction
# Customer churn is when customers stop using a service within a given period of time, which is bad for business.
# The objective of this work is to build a machine learning model to predict which customers will leave the service. The dataset used in this notebook is the [Telco Customer Churn](https://www.kaggle.com/blastchar/telco-customer-churn) hosted on Kaggle. An Exploratory Data Analysis is also performed to better understand the data.
# Another point of this work is to use Deepnote as the development environment and the [PyCaret 2.0](https://pycaret.org/) Python module to build the whole experiment pipeline.

# # 1.1 Environment Setup
# The modules used in this work, with highlights for PyCaret 2.0 and the good plots by [Plotly](https://plotly.com/).

# In[1]:

# Standard
import pandas as pd
import numpy as np
import os

# PyCaret
from pycaret.classification import *

# Plots
from plotly.offline import iplot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns

# Sklearn tools
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *

# Extras
from datetime import date
import warnings
warnings.filterwarnings("ignore")

# Datapath and Setup
os.chdir("..")
PATH = os.getcwd() + os.sep
RANDOM_SEED = 142
K_FOLDS = 5

# And the helper functions used in this notebook.

# In[2]:

# Helper functions for structured data

# Get info about the dataset
def dataset_info(dataset, dataset_name: str):
    print(f"Dataset Name: {dataset_name} "
          f"| Number of Samples: {dataset.shape[0]} "
          f"| Number of Columns: {dataset.shape[1]}")
    print(30 * "=")
    print("Column Data Type")
    print(dataset.dtypes)
    print(30 * "=")
    missing_data = dataset.isnull().sum()
    if sum(missing_data) > 0:
        print(missing_data[missing_data.values > 0])
    else:
        print("No Missing Data on this Dataset!")
    print(30 * "=")
    print("Memory Usage: {} MB".format(
        np.round(dataset.memory_usage(index=True).sum() / 10e5, 3)))

# Dataset Sampling
def data_sampling(dataset, frac: float, random_seed: int):
    data_sampled_a = dataset.sample(frac=frac, random_state=random_seed)
    data_sampled_b = dataset.drop(data_sampled_a.index).reset_index(drop=True)
    data_sampled_a.reset_index(drop=True, inplace=True)
    return data_sampled_a, data_sampled_b

# Bar Plot
def bar_plot(data, plot_title: str, x_axis: str, y_axis: str):
    colors = ["#0080ff"] * len(data)
    colors[0] = "#ff8000"
    trace = go.Bar(y=data.values, x=data.index, text=data.values,
                   marker_color=colors)
    layout = go.Layout(autosize=False, height=600,
                       title={"text": plot_title, "y": 0.9, "x": 0.5,
                              "xanchor": "center", "yanchor": "top"},
                       xaxis={"title": x_axis},
                       yaxis={"title": y_axis})
    fig = go.Figure(data=trace, layout=layout)
    fig.update_layout(template="simple_white")
    fig.update_traces(textposition="outside", textfont_size=14,
                      marker=dict(line=dict(color="#000000", width=2)))
    fig.update_yaxes(automargin=True)
    iplot(fig)

# Plot Pie Chart
def pie_plot(data, plot_title: str):
    trace = go.Pie(labels=data.index, values=data.values)
    layout = go.Layout(autosize=False,
                       title={"text": plot_title, "y": 0.9, "x": 0.5,
                              "xanchor": "center", "yanchor": "top"})
    fig = go.Figure(data=trace, layout=layout)
    fig.update_traces(textfont_size=14,
                      marker=dict(line=dict(color="#000000", width=2)))
    fig.update_yaxes(automargin=True)
    iplot(fig)

# Histogram
def histogram_plot(data, plot_title: str, y_axis: str):
    trace = go.Histogram(x=data)
    layout = go.Layout(autosize=False,
                       title={"text": plot_title, "y": 0.9, "x": 0.5,
                              "xanchor": "center", "yanchor": "top"},
                       yaxis={"title": y_axis})
    fig = go.Figure(data=trace, layout=layout)
    fig.update_traces(marker=dict(line=dict(color="#000000", width=2)))
    fig.update_layout(template="simple_white")
    fig.update_yaxes(automargin=True)
    iplot(fig)

# Particular case: Histogram subplot (1, 2)
def histogram_subplot(dataset_a, dataset_b, feature_a: str, feature_b: str,
                      title: str, title_a: str, title_b: str):
    fig = make_subplots(rows=1, cols=2, subplot_titles=(title_a, title_b))
    fig.add_trace(go.Histogram(x=dataset_a[feature_a], showlegend=False),
                  row=1, col=1)
    fig.add_trace(go.Histogram(x=dataset_b[feature_b], showlegend=False),
                  row=1, col=2)
    fig.update_layout(template="simple_white")
    fig.update_layout(autosize=False,
                      title={"text": title, "y": 0.9, "x": 0.5,
                             "xanchor": "center", "yanchor": "top"},
                      yaxis={"title": "Frequency"})
    fig.update_traces(marker=dict(line=dict(color="#000000", width=2)))
    fig.update_yaxes(automargin=True)
    iplot(fig)

# Calculate scores with Test/Unseen labeled data
def test_score_report(data_unseen, predict_unseen):
    le = LabelEncoder()
    data_unseen["Label"] = le.fit_transform(data_unseen.Churn.values)
    data_unseen["Label"] = data_unseen["Label"].astype(int)
    accuracy = accuracy_score(data_unseen["Label"], predict_unseen["Label"])
    roc_auc = roc_auc_score(data_unseen["Label"], predict_unseen["Label"])
    precision = precision_score(data_unseen["Label"], predict_unseen["Label"])
    recall = recall_score(data_unseen["Label"], predict_unseen["Label"])
    f1 = f1_score(data_unseen["Label"], predict_unseen["Label"])
    df_unseen = pd.DataFrame({
        "Accuracy": [accuracy],
        "AUC": [roc_auc],
        "Recall": [recall],
        "Precision": [precision],
        "F1 Score": [f1]
    })
    return df_unseen

# Confusion Matrix
def conf_mat(data_unseen, predict_unseen):
    unique_label = data_unseen["Label"].unique()
    cmtx = pd.DataFrame(
        confusion_matrix(data_unseen["Label"], predict_unseen["Label"],
                         labels=unique_label),
        index=['{:}'.format(x) for x in unique_label],
        columns=['{:}'.format(x) for x in unique_label]
    )
    ax = sns.heatmap(cmtx, annot=True, fmt="d", cmap="YlGnBu")
    # confusion_matrix puts true labels on rows and predictions on columns
    ax.set_ylabel('Target')
    ax.set_xlabel('Predicted')
    ax.set_title("Predict Unseen Confusion Matrix", size=14)

# # 2. Load Data
#
# The dataset is loaded as a Pandas dataframe and a glimpse of the data is shown.
# A nice thing about Deepnote is that displayed dataframes show the column types, which helps to understand the features.

# In[3]:

dataset = pd.read_csv(PATH + "data" + os.sep + "customers.csv")
dataset.head(3)

# Check for duplicated samples.

# In[4]:

dataset[dataset.duplicated()]

# There are no duplicated samples on the dataset.
# More information about the dataset is needed, such as the number of samples, memory allocation, etc.
# The result is shown in the following output (the data types are shown just for convenience, to make this notebook useful in other environments).

# In[5]:

dataset_info(dataset, "customers")

# The dataset has a small memory allocation (1.183 MB) and is composed of many categorical (object) features and only a few numeric ones. However, one of the categorical features doesn't look right: `TotalCharges` is typed as object, but as shown in the displayed dataframe, the feature is numeric.
# `TotalCharges` is converted from object to float64, the same type as the `MonthlyCharges` feature.

# In[6]:

dataset["TotalCharges"] = pd.to_numeric(dataset["TotalCharges"], errors="coerce")
print(f"The Feature TotalCharges is type {dataset.TotalCharges.dtype} now!")
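# Note that `errors="coerce"` silently turns any entries that can't be parsed as numbers (e.g. blank strings) into `NaN`. A quick sanity check of how many values were coerced may be worth running; this is an optional sketch, not part of the original pipeline:

# In[ ]:

# Optional check: errors="coerce" maps unparsable entries to NaN,
# so any missing values here were introduced by the conversion.
n_coerced = dataset["TotalCharges"].isnull().sum()
print(f"{n_coerced} TotalCharges values could not be parsed and are now NaN.")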
# # 3. Exploratory Data Analysis

# # 3.1 Churn Distribution
# The client churn distribution is checked for any imbalance. As this feature is the target, it's important to choose a strategy for dealing with imbalanced classes.
# Below, a pie chart shows the feature distribution.

# In[7]:

pie_plot(dataset["Churn"].value_counts(), plot_title="Client Churn Distribution")
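# The exact class proportions behind the pie chart can also be printed directly; a small optional check:

# In[ ]:

# Optional: class proportions as fractions instead of a chart
print(dataset["Churn"].value_counts(normalize=True).round(3))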
# There's some imbalance on the churn distribution: 26.5% of the clients have churned, and few occurrences of a label can lead to a bad predictor.
# There are a few ways to work with this case:
# * Random over-sampling: duplicate some samples of the minority class until it reaches a balance, which could lead to an overfitted model.
# * Random down-sampling: remove some samples from the majority class until it reaches a balance, which loses information and doesn't feed the model with all the collected samples.
# * Another resampling technique, such as SMOTE (see the sketch below).
# * Choosing a metric that deals well with imbalanced datasets, like the F1 Score.
#
# The churn problem is about client retention, and it's worth checking for false positives, so precision and recall metrics are a must for this situation.
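# As an illustration of the SMOTE option, here is a minimal standalone sketch with the imbalanced-learn package; the names `X` and `y` are introduced only for this example, and in this notebook the resampling is actually delegated to PyCaret's `fix_imbalance` option later on:

# In[ ]:

# Minimal SMOTE sketch (illustrative only; PyCaret's setup applies SMOTE
# internally when fix_imbalance=True). SMOTE requires numeric features,
# so the categorical columns are one-hot encoded first.
from imblearn.over_sampling import SMOTE

X = pd.get_dummies(dataset.drop(columns=["customerID", "Churn"])).fillna(0)
y = dataset["Churn"]
X_res, y_res = SMOTE(random_state=RANDOM_SEED).fit_resample(X, y)
print(pd.Series(y_res).value_counts())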
# The F1 Score is used to check the quality of the model predictions, as this metric is the harmonic mean of precision and recall.
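# To make the harmonic-mean relationship concrete, a tiny illustrative check (the precision and recall values are made up):

# In[ ]:

# F1 = 2 * (precision * recall) / (precision + recall)
p, r = 0.50, 0.80  # hypothetical precision and recall
print(f"F1 Score: {2 * p * r / (p + r):.3f}")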
# # 3.2 Analysis of the Contract Type
#
# The contract type is a good feature to analyze what makes a client churn from the service. A plot of the contract types of the clients that haven't churned is shown below.

# In[8]:

df_aux = dataset.query('Churn == "No"')
df_aux = df_aux["Contract"].value_counts()
bar_plot(df_aux, "Contract Types of not Churned Clients", "Contract", "Count")

# The Month-to-month contract comes first when compared to the annual contracts, but the difference between the number of contracts is not that big.
# For a better comparison, the same plot is shown for the churned clients.

# In[9]:

df_aux = dataset.query('Churn == "Yes"')
df_aux = df_aux["Contract"].value_counts()
bar_plot(df_aux, "Contract Types of Churned Clients", "Contract", "Count")

# Now the difference between Month-to-month and annual contracts is much bigger, which suggests that annual contracts are better at retaining clients; perhaps fidelity promotions could help to reduce the churn rate.
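# A churn rate per contract type makes this comparison explicit; an optional cross-tabulation sketch:

# In[ ]:

# Optional: churn rate per contract type, normalized within each contract
print(pd.crosstab(dataset["Contract"], dataset["Churn"], normalize="index").round(3))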
# As the problem can be examined more deeply for the Month-to-month contract type, a good idea is to see the Monthly Charges and Total Charges distributions for the not churned clients on this contract.

# In[10]:

df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "No")')
histogram_subplot(df_aux, df_aux, "MonthlyCharges", "TotalCharges",
                  "Charges Distribution for Month-to-month contracts for not Churned Clients",
                  "(a) Monthly Charges Distribution",
                  "(b) Total Charges Distribution")

# From the plots, it can be said that many clients were charged only small amounts, especially for the Total Charges.
# In the following plots, the same features are analyzed, but for the churned clients.

# In[11]:

df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "Yes")')
histogram_subplot(df_aux, df_aux, "MonthlyCharges", "TotalCharges",
                  "Charges Distribution for Month-to-month contracts for Churned Clients",
                  "(a) Monthly Charges Distribution",
                  "(b) Total Charges Distribution")

# Total Charges show the same behaviour, but the Monthly Charges for many churned clients were high; maybe the amount charged could lead a client to leave the service.
# Still on the Month-to-month contract, it's time to analyze the most used payment methods of the churned clients.

# In[12]:

df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "Yes")')
df_aux = df_aux["PaymentMethod"].value_counts()
bar_plot(df_aux, "Payment Method of Month-to-month contract Churned Clients",
         "Payment Method", "Count")

# Many churned clients used to pay with electronic checks, while automatic payments, such as bank transfers or credit card, have few churned clients. A good idea could be to offer promotions to clients that use automatic payment methods.
# Lastly, the tenure of the churned clients.

# In[13]:

df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "Yes")')
df_aux = df_aux["tenure"].value_counts().head(5)
bar_plot(df_aux, "Tenure of Month-to-month contract for Churned Clients",
         "Tenure", "Count")

# Most of these clients used the service for just one month. It seems the clients used the service to check its quality, or couldn't afford the charges: the Monthly Charges for these clients were high and the Total Charges were small, as the clients stayed only a short time.

# # 4. Setting up PyCaret
# Before setting up PyCaret, a random sample of 10% of the dataset is set aside to make predictions on unseen data.

# In[14]:

data, data_unseen = data_sampling(dataset, 0.9, RANDOM_SEED)
print(f"There are {data_unseen.shape[0]} samples for Unseen Data.")

# PyCaret's setup is made with the remaining 90% of the samples and uses just one function (`setup`) from the module.
# It can be configured with various options, such as data pre-processing, feature engineering, etc. The ease and efficiency of PyCaret save a lot of time when prototyping models.
# Each setup is an experiment, and for this problem the following options are used:
# * Normalization of the numerical features with Z-Score.
# * Feature selection with permutation importance techniques.
# * Outlier removal.
# * Feature removal based on multicollinearity.
# * Feature scaling transformation.
# * Ignore low variance on features.
# * PCA for dimensionality reduction, as the dataset has many features.
# * Numeric binning on the features `MonthlyCharges` and `TotalCharges`.
# * 70% of the samples for train and 30% for test.
# * Fix imbalance with SMOTE.
# * The models will store their metric results via MLflow; to access the MLflow UI, type `mlflow ui` on your cmd or `!mlflow ui` in a cell at the bottom of this notebook.

# In[15]:

exp01 = setup(data=data, target="Churn",
              session_id=RANDOM_SEED,
              ignore_features=["customerID"],
              numeric_features=["SeniorCitizen"],
              normalize=True,
              feature_selection=True,
              remove_outliers=True,
              remove_multicollinearity=True,
              fix_imbalance=True,
              transformation=True,
              ignore_low_variance=True,
              pca=True,
              bin_numeric_features=["MonthlyCharges", "TotalCharges"],
              silent=True,
              experiment_name="customer-churn-prediction",
              log_experiment=True)

# PyCaret first shows whether all features have been inferred with their corresponding types; if everything is right, press enter on the blank bar and the setup finishes, showing a summary of the experiment.
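# To sanity-check what the setup produced (e.g., the 70/30 split and the transformed feature space), PyCaret 2.0's `get_config` can be used; an optional sketch:

# In[ ]:

# Optional: inspect the transformed train/test split created by setup()
print(get_config("X_train").shape, get_config("X_test").shape)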
# # 5. Model Build
# A great tool in PyCaret is the ability to build many models and compare them on a metric to pick the best ones!
# The models are sorted by F1 Score, since precision and recall are both important for the evaluation.
# The cross-validation is made with 5 folds.

# In[16]:

top_model = compare_models(fold=K_FOLDS, sort="F1", n_select=1,
                           blacklist=["gbc", "catboost"])

# The best model suggested by PyCaret is the Logistic Regression, with an F1 Score around 0.63 and a good Recall, around 0.77.
# Time to tune the model. The `choose_better` argument returns the better model between the tuned and the original one.

# In[17]:

tuned_model = tune_model(estimator=top_model, fold=K_FOLDS,
                         optimize="F1", choose_better=True, verbose=False)

# Let's see the hyperparameters of the chosen model.

# In[18]:

plot_model(tuned_model, plot="parameter")

# PyCaret also has functions to make ensembles; for this implementation, a bagged model is built.

# In[19]:

bagged_model = ensemble_model(tuned_model, fold=K_FOLDS)

# And now a boosted model.

# In[20]:

boosted_model = ensemble_model(tuned_model, fold=K_FOLDS, method="Boosting")

# The boosted model improved the F1 Score a bit. It's also possible to make blended and stacked models with PyCaret; a blended model is created using the tuned and boosted models.

# In[21]:

blended_model = blend_models(estimator_list=[tuned_model, boosted_model], fold=K_FOLDS)
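# For completeness, a stacked model could be built the same way with PyCaret's `stack_models`; this optional sketch was not part of the original runs, so it is left commented out:

# In[ ]:

# Optional: stacking the tuned and boosted models (uncomment to run)
# stacked_model = stack_models(estimator_list=[tuned_model, boosted_model], fold=K_FOLDS)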
# The best model is still the boosted model.
# Let's plot some metric curves and matrices and see which classifier the model is, starting with the hyperparameters and the underlying classifier.

# In[22]:

best_model = boosted_model
plot_model(best_model, plot="parameter")
print(f"Model: {type(best_model)}")

# Let's plot the ROC curve, PR curve, Confusion Matrix and the metrics for each class.

# In[23]:

plot_model(best_model, plot="auc")

# The AUC for each class was good: 0.85.

# In[24]:

plot_model(best_model, plot="pr")

# The PR curve got an average precision around 0.7, which is good.

# In[25]:

plot_model(best_model, plot="confusion_matrix")

# The Confusion Matrix shows that the churned clients have been classified as not churned in 50% of the predictions.

# In[26]:

plot_model(best_model, plot="class_report")

# The model has done a good job on the metrics for class 0 (Not Churned) but got a Precision close to 0.5 for class 1 (Churned).

# # 6. Prediction on Test Data
# The test is made with the remaining 30% of the data that PyCaret held out on the setup; it's important to check that the model is not overfitting.

# In[27]:

predict_model(best_model);

# As everything is right with the model, it's time to finalize it by fitting it on all the data.

# In[28]:

final_model = finalize_model(best_model)

# # 7. Prediction on Unseen Data
# The remaining 10% of the data is used to make predictions on unseen samples, which may include some outliers; that's how real-world data works.
# Only the Kappa Score is not shown, as the focus is the F1 Score, since precision and recall are important to capture false positives and false negatives.
# It's not necessary to make any transformation on the data; PyCaret does this.

# In[29]:

predict_unseen = predict_model(final_model, data=data_unseen);
score_unseen = test_score_report(data_unseen, predict_unseen)
print(score_unseen.to_string(index=False))
conf_mat(data_unseen, predict_unseen)

# And the model predicts on the unseen data just as it did in training! The model was successfully built!

# # 8. Save Experiment and Model
# PyCaret allows saving the whole pipeline experiment and the model for deployment.
# It's recommended to save the experiments with their date.

# In[30]:

save_model(final_model, PATH + "models" + os.sep + "modelCCP_" +
           date.today().strftime("%m-%d-%Y"))

# # 9. Conclusion
# From the results and explanations presented here, some conclusions can be drawn:
# * The type of contract has a strong relationship with churn: Month-to-month contracts with a high amount of charges can lead a client to leave the service.
# * The best model is an AdaBoostClassifier with Logistic Regression as the base estimator.
# * For the predictions made by the model, and based on the precision and recall scores (the F1 Score tries to balance these two metrics), the precision was near 50%, which means that 50% of the clients classified as churned actually churned; on the other hand, the recall was good, with around 83% of the actually churned clients predicted correctly.
# * The metrics must be used in favor of the business interests: is a more precise prediction of the churned clients needed, or should more of these clients be captured in the predictions?
#
# From the tools and environment used:
# * PyCaret is incredible; it speeds up the model building a lot, and the pre-processing is very useful, beyond the functions to save and deploy the model.
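# As mentioned in the setup options of Section 4, the metrics logged by `log_experiment=True` can be browsed with the MLflow UI from a cell at the bottom of this notebook:

# In[ ]:

# Uncomment to launch the MLflow tracking UI (this blocks the notebook
# kernel while the server is running; stop the cell to get it back).
# !mlflow ui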