#!/usr/bin/env python
# coding: utf-8

# # What is customer churn?

# Customer churn is defined as when customers or subscribers discontinue doing business with a firm or service.
#
# Customers in the telecom industry can choose from a variety of service providers and actively switch from one to the next. In this highly competitive market, the telecommunications business has an annual churn rate of 15-25 percent.
#
# Individualized customer retention is difficult because most firms have a large number of customers and cannot afford to devote much time to each of them. The costs would be too great, outweighing the additional revenue. However, if a company could forecast which customers are likely to leave ahead of time, it could focus its retention efforts on these "high risk" clients alone. The ultimate goal is to expand the coverage area and retain more loyal customers. The key to succeeding in this market lies in the customers themselves.
#
# Customer churn is a critical metric because it is much less expensive to retain existing customers than to acquire new ones.
#
# To detect early signs of potential churn, one must first develop a holistic view of the customers and their interactions across numerous channels. By addressing churn, these businesses may not only preserve their market position, but also grow and thrive. The more customers a company has in its network, the lower its cost of acquisition and the larger its profit. Reducing client attrition and implementing an effective retention strategy are therefore the company's key focus for success.

# ### Objectives:
# - Find the percentage of churned customers versus customers who keep their active services.
# - Analyse the data in terms of the various features responsible for customer churn.
# - Find the machine learning model best suited for correctly classifying churn and non-churn customers.
# ### Dataset:
# [Telco Customer Churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)

# ## Exploratory Data Analysis

# In[1]:

# Standard libraries for data analysis
import pandas as pd
import numpy as np

# Standard libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings('ignore')

# Standard libraries for encoding categorical variables and scaling
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Libraries for machine learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# In[2]:

# Standard libraries for measuring performance
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, fbeta_score, confusion_matrix,
                             classification_report, precision_recall_curve,
                             auc, roc_auc_score, roc_curve)

# In[3]:

# Loading the dataset
data = pd.read_csv("data.csv")
data.head()

# In[4]:

data.isnull().any().any()

# In[5]:

data.info()

# In[6]:

data.shape

# ## Checking for missing or garbage values

# In[7]:

# Removing customerID, as it is unique to each person and therefore meaningless for our analysis
data = data.drop(["customerID"], axis=1)
data.head()

# In[8]:

data[data["TotalCharges"] == ' ']

# In[9]:

data['TotalCharges'] = pd.to_numeric(data.TotalCharges, errors='coerce')
data.isnull().sum()

# There are 11 records with TotalCharges missing

# In[10]:

data[data["tenure"] == 0]

# In[11]:

# Dropping newly acquired customers (tenure of zero) as they may not aid our prediction model
data.drop(labels=data[data["tenure"] == 0].index, axis=0, inplace=True)

# In[12]:

# Imputing missing "TotalCharges" with the column mean
data["TotalCharges"] = data["TotalCharges"].fillna(data["TotalCharges"].mean())

# In[13]:

# Checking for missing values
data['TotalCharges'] = pd.to_numeric(data.TotalCharges, errors='coerce')
data.isnull().sum()

# In[14]:

# Checking the number of unique values in the SeniorCitizen attribute
data.SeniorCitizen.unique()

# In[15]:

data.SeniorCitizen = data.SeniorCitizen.map({0: "No", 1: "Yes"})
data.head()

# In[16]:

data.InternetService.describe()

# ## EDA

# In[17]:

# Visualising the Yes/No Churn distribution
type_ = ["No", "Yes"]
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Pie(labels=type_, values=data['Churn'].value_counts(), name="Churn"))
fig.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)
fig.update_layout(
    title_text="Churn Distributions",
    annotations=[dict(text='Churn', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()

# In[18]:

data.Churn[data.Churn == "No"].groupby(by=data.gender).count()

# The distribution is almost equal

# In[19]:

data.Churn[data.Churn == "Yes"].groupby(by=data.gender).count()

# The distribution is almost equal
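# The pie chart above reports the churn split as percentages; a quick numeric
# cross-check is possible directly from the dataframe. A minimal sketch, assuming the
# Churn column still holds its raw "Yes"/"No" labels at this point (encoding happens
# later in the notebook):

# In[ ]:

# Fraction of customers per Churn label; normalize=True yields proportions rather than counts
churn_rate = data['Churn'].value_counts(normalize=True)
print(churn_rate)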
# In[20]:

plt.figure(figsize=(6, 6))
labels = ["Churn: Yes", "Churn: No"]
values = [1869, 5163]
labels_gender = ["F", "M", "F", "M"]
sizes_gender = [939, 930, 2544, 2619]
colors = ['#ff6666', '#66b3ff']
colors_gender = ['#c2c2f0', '#ffb3e6', '#c2c2f0', '#ffb3e6']
explode = (0.3, 0.3)
explode_gender = (0.1, 0.1, 0.1, 0.1)
textprops = {"fontsize": 15}

# Outer ring: churn split; inner ring: gender split within each churn group
plt.pie(values, labels=labels, autopct='%1.1f%%', pctdistance=1.08, labeldistance=0.8,
        colors=colors, startangle=90, frame=True, explode=explode, radius=10,
        textprops=textprops, counterclock=True)
plt.pie(sizes_gender, labels=labels_gender, colors=colors_gender, startangle=90,
        explode=explode_gender, radius=7, textprops=textprops, counterclock=True)

centre_circle = plt.Circle((0, 0), 5, color='black', fc='white', linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('Churn Distribution w.r.t Gender: Male(M), Female(F)', fontsize=15, y=1.1)
plt.axis('equal')
plt.tight_layout()
plt.show()

# In[21]:

fig = px.histogram(data, x="Churn", color="Contract", barmode="group",
                   title="Customer contract distribution")
fig.update_layout(width=700, height=500, bargap=0.2)
fig.show()

# Customers with a month-to-month contract are more likely to churn

# In[22]:

# Taking labels from value_counts().index so labels and counts stay aligned
# (unique() can return categories in a different order than value_counts())
values = data['PaymentMethod'].value_counts()
labels = values.index
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_layout(title_text="Payment Method Distribution")
fig.show()

fig = px.histogram(data, x="Churn", color="PaymentMethod",
                   title="Customer Payment Method distribution w.r.t. Churn")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

# In[23]:

fig = go.Figure()
fig.add_trace(go.Bar(
    x=[['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
       ["Female", "Male", "Female", "Male"]],
    y=[965, 992, 219, 240],
    name='DSL',
))
fig.add_trace(go.Bar(
    x=[['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
       ["Female", "Male", "Female", "Male"]],
    y=[889, 910, 664, 633],
    name='Fiber optic',
))
fig.add_trace(go.Bar(
    x=[['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
       ["Female", "Male", "Female", "Male"]],
    y=[690, 717, 56, 57],
    name='No Internet',
))
fig.update_layout(title_text="Churn Distribution w.r.t. Internet Service and Gender")
fig.show()

# Customers with a fiber optic subscription are more likely to churn compared to the other internet services
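# The nested pie and the grouped bars above hard-code their counts. A minimal sketch
# for deriving the same tallies from the dataframe itself, assuming the raw Churn,
# gender and InternetService labels are still in place:

# In[ ]:

# Gender split within each churn group (the counts used by the nested pie)
print(pd.crosstab(data['Churn'], data['gender']))

# Internet service split per churn group and gender (the counts used by the grouped bars)
print(pd.crosstab([data['Churn'], data['gender']], data['InternetService']))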
# In[24]:

data.head()

# In[25]:

# Label-encode every object-typed column; numeric columns pass through unchanged
def encode_data(dataframe):
    if dataframe.dtype == "object":
        dataframe = LabelEncoder().fit_transform(dataframe)
    return dataframe

data = data.apply(lambda x: encode_data(x))
data.head()

# In[26]:

X = data.drop(columns="Churn")
y = data["Churn"].values

# In[27]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42, stratify=y)

# In[28]:

def distplot(feature, frame, color='r'):
    plt.figure(figsize=(8, 3))
    plt.title("Distribution for {}".format(feature))
    # Histogram with a KDE overlay (histplot is the current replacement for the removed sns.distplot)
    ax = sns.histplot(frame[feature], color=color, kde=True)

# In[29]:

col = ["tenure", 'MonthlyCharges', 'TotalCharges']
for feature in col:
    distplot(feature, data)

# In[30]:

data_std = pd.DataFrame(StandardScaler().fit_transform(data[col]).astype('float64'), columns=col)
for feat in col:
    distplot(feat, data_std, color='c')

# In[31]:

data.columns

# In[32]:

for i in data.columns:
    print(i, ": ", data[i].unique())

# In[33]:

# Divide the columns into three categories: one for standardisation, one for one-hot
# encoding and one for label encoding (every column was already label-encoded above;
# these lists are kept for reference)
cat_cols_ohe = ['PaymentMethod', 'Contract', 'InternetService']  # those that would suit one-hot encoding
cat_cols_le = list(set(X_train.columns) - set(col) - set(cat_cols_ohe))  # those that suit label encoding
print(cat_cols_le)

# In[34]:

# Fit the scaler on the training set only and reuse it on the test set,
# so no information from the test set leaks into the scaling parameters
scaler = StandardScaler()
X_train[col] = scaler.fit_transform(X_train[col])
X_test[col] = scaler.transform(X_test[col])

# In[35]:

models = []
models.append(('Logistic Regression', LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')))
models.append(('SVC', SVC(kernel='linear', random_state=42)))
models.append(('KNN', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)))
models.append(('Gaussian NB', GaussianNB()))
models.append(('Decision Tree Classifier', DecisionTreeClassifier(criterion='entropy', random_state=42)))

# In[36]:

acc_results = []
auc_results = []
names = []
result_col = ["Algorithm", "ROC AUC Mean", "ROC AUC STD", "Accuracy Mean", "Accuracy STD"]
model_results = pd.DataFrame(columns=result_col)
i = 0

# K-fold cross validation
for name, model in models:
    names.append(name)
    kfold = model_selection.KFold(n_splits=10)
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring="roc_auc")
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean() * 100, 2),
                            round(cv_auc_results.std() * 100, 2),
                            round(cv_acc_results.mean() * 100, 2),
                            round(cv_acc_results.std() * 100, 2)]
    i += 1

model_results.sort_values(by=['ROC AUC Mean'], ascending=False)
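# Because X_train was scaled as a whole before cross-validation, each validation fold
# has already seen the scaler's statistics. A minimal sketch of the leak-free pattern
# using a sklearn Pipeline, shown here for the logistic regression only; the Pipeline
# and ColumnTransformer imports are additions, and pipe/cv_auc are illustrative names:

# In[ ]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Scale only the continuous columns, re-fitting on the training folds of each CV split;
# the remaining (label-encoded) columns pass through untouched
pipe = Pipeline([
    ('prep', ColumnTransformer([('scale', StandardScaler(), col)], remainder='passthrough')),
    ('clf', LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')),
])
cv_auc = cross_val_score(pipe, X_train, y_train, cv=KFold(n_splits=10), scoring="roc_auc")
print(round(cv_auc.mean() * 100, 2), round(cv_auc.std() * 100, 2))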
# 2nd Iteration

# In[37]:

# Evaluation of results
def model_evaluation(y_test, y_pred, model_name):
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    f2 = fbeta_score(y_test, y_pred, beta=2.0)
    results = pd.DataFrame([[model_name, acc, prec, rec, f1, f2]],
                           columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score", "F2 Score"])
    results = results.sort_values(["Precision", "Recall", "F2 Score"], ascending=False)
    return results

# In[38]:

# Logistic regression
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# SVC
classifier2 = SVC(kernel='linear', random_state=42)
classifier2.fit(X_train, y_train)
y_pred2 = classifier2.predict(X_test)

# KNN
classifier3 = KNeighborsClassifier(n_neighbors=22, metric="minkowski", p=2)
classifier3.fit(X_train, y_train)
y_pred3 = classifier3.predict(X_test)

# Naive Bayes
classifier5 = GaussianNB()
classifier5.fit(X_train, y_train)
y_pred5 = classifier5.predict(X_test)

# Decision tree
classifier6 = DecisionTreeClassifier(criterion="entropy", random_state=42)
classifier6.fit(X_train, y_train)
y_pred6 = classifier6.predict(X_test)

# In[39]:

lr = model_evaluation(y_test, y_pred, "Logistic Regression")
svm = model_evaluation(y_test, y_pred2, "SVM (Linear)")
knn = model_evaluation(y_test, y_pred3, "K-Nearest Neighbours")
nb = model_evaluation(y_test, y_pred5, "Naive Bayes")
dt = model_evaluation(y_test, y_pred6, "Decision Tree")

# In[40]:

# Combine the per-model results and rank them
eval_ = (pd.concat([lr, svm, knn, nb, dt])
         .sort_values(["Precision", "Recall", "F2 Score"], ascending=False)
         .reset_index(drop=True))
eval_

# In[41]:

predictions = [y_pred, y_pred2, y_pred3, y_pred5, y_pred6]
# Pair each prediction with its own model name, in the order the predictions were generated
prediction_names = ["Logistic Regression", "SVM (Linear)", "K-Nearest Neighbours",
                    "Naive Bayes", "Decision Tree"]
for i, j in zip(predictions, prediction_names):
    plt.figure(figsize=(4, 3))
    sns.heatmap(confusion_matrix(y_test, i), annot=True, fmt="d", linecolor="k", linewidths=3)
    plt.title(j, fontsize=14)
    plt.show()

# In[42]:

def ROC_curve(classifier_, name):
    classifier_.fit(X_train, y_train)
    # Probability of the positive (churn) class
    probs = classifier_.predict_proba(X_test)[:, 1]
    classifier_roc_auc = roc_auc_score(y_test, probs)
    fpr, tpr, thresholds = roc_curve(y_test, probs)
    plt.figure(figsize=(14, 6))
    label_ = name + ' (area = %0.4f)' % classifier_roc_auc
    # Plot the classifier ROC
    plt.plot(fpr, tpr, label=label_)
    # Plot the base rate ROC
    plt.plot([0, 1], [0, 1], 'k--', label='Base Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('True Positive Rate \n', horizontalalignment="center",
               fontstyle="normal", fontsize="medium", fontfamily="sans-serif")
    plt.xlabel('\nFalse Positive Rate \n', horizontalalignment="center",
               fontstyle="normal", fontsize="medium", fontfamily="sans-serif")
    plt.title('ROC Graph \n', horizontalalignment="center",
              fontstyle="normal", fontsize=22, fontfamily="sans-serif")
    plt.legend(loc="lower right", fontsize="medium")
    plt.xticks(rotation=0, horizontalalignment="center")
    plt.yticks(rotation=0, horizontalalignment="right")
    plt.show()

# In[43]:

# SVC is omitted here: without probability=True it exposes no predict_proba
classifiers = [classifier, classifier3, classifier5, classifier6]
model_names_ = ["Logistic Regression", "K-Nearest Neighbours", "Naive Bayes", "Decision Tree"]
for clf, name in zip(classifiers, model_names_):
    ROC_curve(clf, name)
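# The linear SVC is left out of the ROC loop above because it has no predict_proba by
# default. A minimal sketch for including it anyway, using decision_function scores
# (roc_curve accepts any continuous score, not just probabilities); svc_scores, svc_auc,
# svc_fpr and svc_tpr are illustrative names:

# In[ ]:

# Signed margin distances from the separating hyperplane serve as ranking scores
svc_scores = classifier2.decision_function(X_test)
svc_auc = roc_auc_score(y_test, svc_scores)
svc_fpr, svc_tpr, _ = roc_curve(y_test, svc_scores)

plt.figure(figsize=(14, 6))
plt.plot(svc_fpr, svc_tpr, label='SVM (Linear) (area = %0.4f)' % svc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='Base Rate')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Graph')
plt.legend(loc="lower right")
plt.show()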