#!/usr/bin/env python
# coding: utf-8

# # What is customer churn?

# Customer churn is defined as when customers or subscribers discontinue doing business with a firm or service.
#
# Customers in the telecom industry can choose from a variety of service providers and actively switch from one to the next. In this highly competitive market, the telecommunications business has an annual churn rate of 15-25 percent.
#
# Individualized customer retention is difficult because most firms have a large number of customers and cannot afford to devote much time to each of them. The costs would be too great, outweighing the additional revenue. However, if a company could forecast which customers are likely to leave ahead of time, it could focus its retention efforts on these "high risk" clients alone. The ultimate goal is to expand the coverage area and retain more loyal customers. The key to succeeding in this market lies in the customers themselves.
#
# Customer churn is a critical metric because it is much less expensive to retain existing customers than to acquire new ones.
#
# To detect early signs of potential churn, one must first develop a holistic view of the customers and their interactions across numerous channels. By addressing churn, these businesses may not only preserve their market position, but also grow and thrive. The more customers a company has in its network, the lower its cost of acquisition and the larger its profit. Reducing client attrition and implementing an effective retention strategy are therefore the company's key focus for success.

# ### Objectives:
# - Find the percentage of churned customers versus customers who keep their active services.
# - Analyse the data in terms of the various features responsible for customer churn.
# - Find the machine learning model best suited for correctly classifying churn and non-churn customers.
# ### Dataset:
# [Telco Customer Churn](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)

# ## Exploratory Data Analysis

# In[1]:

# Standard libraries for data analysis
import pandas as pd
import numpy as np

# Standard libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings('ignore')

# Standard libraries for encoding categorical variables and scaling
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Libraries for machine learning
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# In[2]:

# Standard libraries for measuring performance
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, fbeta_score, confusion_matrix,
                             classification_report, precision_recall_curve,
                             auc, roc_auc_score, roc_curve)

# In[3]:

# Loading the dataset
data = pd.read_csv("data.csv")
data.head()

# In[4]:

data.isnull().any().any()

# In[5]:

data.info()

# In[6]:

data.shape

# ## Checking for missing or garbage values

# In[7]:

# Removing customerID, as it is unique to each person and therefore meaningless for our analysis
data = data.drop(["customerID"], axis=1)
data.head()

# In[8]:

data[data["TotalCharges"] == ' ']

# In[9]:

data['TotalCharges'] = pd.to_numeric(data.TotalCharges, errors='coerce')
data.isnull().sum()

# There are 11 records with TotalCharges missing

# In[10]:

data[data["tenure"] == 0]

# In[11]:

# Dropping newly acquired customers (tenure of zero) as they may not aid our prediction model
data.drop(labels=data[data["tenure"] == 0].index, axis=0, inplace=True)

# In[12]:

# Imputing missing "TotalCharges" with the column mean
data["TotalCharges"] = data["TotalCharges"].fillna(data["TotalCharges"].mean())

# In[13]:

# Checking for missing values
data['TotalCharges'] = pd.to_numeric(data.TotalCharges, errors='coerce')
data.isnull().sum()

# In[14]:

# Checking the number of unique values in the SeniorCitizen attribute
data.SeniorCitizen.unique()

# In[15]:

data.SeniorCitizen = data.SeniorCitizen.map({0: "No", 1: "Yes"})
data.head()

# In[16]:

data.InternetService.describe()

# ## EDA

# In[17]:

# Visualising the Yes/No Churn distribution
type_ = ["No", "Yes"]
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Pie(labels=type_, values=data['Churn'].value_counts(), name="Churn"))
fig.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)
fig.update_layout(
    title_text="Churn Distributions",
    annotations=[dict(text='Churn', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()

# In[18]:

data.Churn[data.Churn == "No"].groupby(by=data.gender).count()

# The distribution is almost equal

# In[19]:

data.Churn[data.Churn == "Yes"].groupby(by=data.gender).count()

# The distribution is almost equal
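# The pie chart above reports the churn split as percentages; a quick numeric
# cross-check is possible directly from the dataframe. A minimal sketch, assuming the
# Churn column still holds its raw "Yes"/"No" labels at this point (encoding happens
# later in the notebook):

# In[ ]:

# Fraction of customers per Churn label; normalize=True yields proportions rather than counts
churn_rate = data['Churn'].value_counts(normalize=True)
print(churn_rate)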
# In[20]:

plt.figure(figsize=(6, 6))
labels = ["Churn: Yes", "Churn: No"]
values = [1869, 5163]
labels_gender = ["F", "M", "F", "M"]
sizes_gender = [939, 930, 2544, 2619]
colors = ['#ff6666', '#66b3ff']
colors_gender = ['#c2c2f0', '#ffb3e6', '#c2c2f0', '#ffb3e6']
explode = (0.3, 0.3)
explode_gender = (0.1, 0.1, 0.1, 0.1)
textprops = {"fontsize": 15}

# Outer ring: churn split; inner ring: gender split within each churn group
plt.pie(values, labels=labels, autopct='%1.1f%%', pctdistance=1.08, labeldistance=0.8,
        colors=colors, startangle=90, frame=True, explode=explode, radius=10,
        textprops=textprops, counterclock=True)
plt.pie(sizes_gender, labels=labels_gender, colors=colors_gender, startangle=90,
        explode=explode_gender, radius=7, textprops=textprops, counterclock=True)

centre_circle = plt.Circle((0, 0), 5, color='black', fc='white', linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('Churn Distribution w.r.t Gender: Male(M), Female(F)', fontsize=15, y=1.1)
plt.axis('equal')
plt.tight_layout()
plt.show()

# In[21]:

fig = px.histogram(data, x="Churn", color="Contract", barmode="group",
                   title="Customer contract distribution")
fig.update_layout(width=700, height=500, bargap=0.2)
fig.show()

# Customers with a month-to-month contract are more likely to churn

# In[22]:

# Taking labels from value_counts().index so labels and counts stay aligned
# (unique() can return categories in a different order than value_counts())
values = data['PaymentMethod'].value_counts()
labels = values.index
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_layout(title_text="Payment Method Distribution")
fig.show()

fig = px.histogram(data, x="Churn", color="PaymentMethod",
                   title="Customer Payment Method distribution w.r.t. Churn")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

# In[23]:

fig = go.Figure()
fig.add_trace(go.Bar(
    x=[['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
       ["Female", "Male", "Female", "Male"]],
    y=[965, 992, 219, 240],
    name='DSL',
))
fig.add_trace(go.Bar(
    x=[['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
       ["Female", "Male", "Female", "Male"]],
    y=[889, 910, 664, 633],
    name='Fiber optic',
))
fig.add_trace(go.Bar(
    x=[['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
       ["Female", "Male", "Female", "Male"]],
    y=[690, 717, 56, 57],
    name='No Internet',
))
fig.update_layout(title_text="Churn Distribution w.r.t. Internet Service and Gender")
fig.show()

# Customers with a fiber optic subscription are more likely to churn compared to the other internet services
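# The nested pie and the grouped bars above hard-code their counts. A minimal sketch
# for deriving the same tallies from the dataframe itself, assuming the raw Churn,
# gender and InternetService labels are still in place:

# In[ ]:

# Gender split within each churn group (the counts used by the nested pie)
print(pd.crosstab(data['Churn'], data['gender']))

# Internet service split per churn group and gender (the counts used by the grouped bars)
print(pd.crosstab([data['Churn'], data['gender']], data['InternetService']))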
# In[24]:

data.head()

# In[25]:

# Label-encode every object-typed column; numeric columns pass through unchanged
def encode_data(dataframe):
    if dataframe.dtype == "object":
        dataframe = LabelEncoder().fit_transform(dataframe)
    return dataframe

data = data.apply(lambda x: encode_data(x))
data.head()

# In[26]:

X = data.drop(columns="Churn")
y = data["Churn"].values

# In[27]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42, stratify=y)

# In[28]:

def distplot(feature, frame, color='r'):
    plt.figure(figsize=(8, 3))
    plt.title("Distribution for {}".format(feature))
    # Histogram with a KDE overlay (histplot is the current replacement for the removed sns.distplot)
    ax = sns.histplot(frame[feature], color=color, kde=True)

# In[29]:

col = ["tenure", 'MonthlyCharges', 'TotalCharges']
for feature in col:
    distplot(feature, data)

# In[30]:

data_std = pd.DataFrame(StandardScaler().fit_transform(data[col]).astype('float64'), columns=col)
for feat in col:
    distplot(feat, data_std, color='c')

# In[31]:

data.columns

# In[32]:

for i in data.columns:
    print(i, ": ", data[i].unique())

# In[33]:

# Divide the columns into three categories: one for standardisation, one for one-hot
# encoding and one for label encoding (every column was already label-encoded above;
# these lists are kept for reference)
cat_cols_ohe = ['PaymentMethod', 'Contract', 'InternetService']  # those that would suit one-hot encoding
cat_cols_le = list(set(X_train.columns) - set(col) - set(cat_cols_ohe))  # those that suit label encoding
print(cat_cols_le)

# In[34]:

# Fit the scaler on the training set only and reuse it on the test set,
# so no information from the test set leaks into the scaling parameters
scaler = StandardScaler()
X_train[col] = scaler.fit_transform(X_train[col])
X_test[col] = scaler.transform(X_test[col])

# In[35]:

models = []
models.append(('Logistic Regression', LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')))
models.append(('SVC', SVC(kernel='linear', random_state=42)))
models.append(('KNN', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)))
models.append(('Gaussian NB', GaussianNB()))
models.append(('Decision Tree Classifier', DecisionTreeClassifier(criterion='entropy', random_state=42)))

# In[36]:

acc_results = []
auc_results = []
names = []
result_col = ["Algorithm", "ROC AUC Mean", "ROC AUC STD", "Accuracy Mean", "Accuracy STD"]
model_results = pd.DataFrame(columns=result_col)
i = 0

# K-fold cross validation
for name, model in models:
    names.append(name)
    kfold = model_selection.KFold(n_splits=10)
    cv_acc_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    cv_auc_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring="roc_auc")
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    model_results.loc[i] = [name,
                            round(cv_auc_results.mean() * 100, 2),
                            round(cv_auc_results.std() * 100, 2),
                            round(cv_acc_results.mean() * 100, 2),
                            round(cv_acc_results.std() * 100, 2)]
    i += 1

model_results.sort_values(by=['ROC AUC Mean'], ascending=False)
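# Because X_train was scaled as a whole before cross-validation, each validation fold
# has already seen the scaler's statistics. A minimal sketch of the leak-free pattern
# using a sklearn Pipeline, shown here for the logistic regression only; the Pipeline
# and ColumnTransformer imports are additions, and pipe/cv_auc are illustrative names:

# In[ ]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Scale only the continuous columns, re-fitting on the training folds of each CV split;
# the remaining (label-encoded) columns pass through untouched
pipe = Pipeline([
    ('prep', ColumnTransformer([('scale', StandardScaler(), col)], remainder='passthrough')),
    ('clf', LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')),
])
cv_auc = cross_val_score(pipe, X_train, y_train, cv=KFold(n_splits=10), scoring="roc_auc")
print(round(cv_auc.mean() * 100, 2), round(cv_auc.std() * 100, 2))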
# 2nd Iteration

# In[37]:

# Evaluation of results
def model_evaluation(y_test, y_pred, model_name):
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    f2 = fbeta_score(y_test, y_pred, beta=2.0)
    results = pd.DataFrame([[model_name, acc, prec, rec, f1, f2]],
                           columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score", "F2 Score"])
    results = results.sort_values(["Precision", "Recall", "F2 Score"], ascending=False)
    return results

# In[38]:

# Logistic regression
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# SVC
classifier2 = SVC(kernel='linear', random_state=42)
classifier2.fit(X_train, y_train)
y_pred2 = classifier2.predict(X_test)

# KNN
classifier3 = KNeighborsClassifier(n_neighbors=22, metric="minkowski", p=2)
classifier3.fit(X_train, y_train)
y_pred3 = classifier3.predict(X_test)

# Naive Bayes
classifier5 = GaussianNB()
classifier5.fit(X_train, y_train)
y_pred5 = classifier5.predict(X_test)

# Decision tree
classifier6 = DecisionTreeClassifier(criterion="entropy", random_state=42)
classifier6.fit(X_train, y_train)
y_pred6 = classifier6.predict(X_test)

# In[39]:

lr = model_evaluation(y_test, y_pred, "Logistic Regression")
svm = model_evaluation(y_test, y_pred2, "SVM (Linear)")
knn = model_evaluation(y_test, y_pred3, "K-Nearest Neighbours")
nb = model_evaluation(y_test, y_pred5, "Naive Bayes")
dt = model_evaluation(y_test, y_pred6, "Decision Tree")

# In[40]:

# Combine the per-model results and rank them
eval_ = (pd.concat([lr, svm, knn, nb, dt])
         .sort_values(["Precision", "Recall", "F2 Score"], ascending=False)
         .reset_index(drop=True))
eval_

# In[41]:

predictions = [y_pred, y_pred2, y_pred3, y_pred5, y_pred6]
# Pair each prediction with its own model name, in the order the predictions were generated
prediction_names = ["Logistic Regression", "SVM (Linear)", "K-Nearest Neighbours",
                    "Naive Bayes", "Decision Tree"]
for i, j in zip(predictions, prediction_names):
    plt.figure(figsize=(4, 3))
    sns.heatmap(confusion_matrix(y_test, i), annot=True, fmt="d", linecolor="k", linewidths=3)
    plt.title(j, fontsize=14)
    plt.show()

# In[42]:

def ROC_curve(classifier_, name):
    classifier_.fit(X_train, y_train)
    # Probability of the positive (churn) class
    probs = classifier_.predict_proba(X_test)[:, 1]
    classifier_roc_auc = roc_auc_score(y_test, probs)
    fpr, tpr, thresholds = roc_curve(y_test, probs)
    plt.figure(figsize=(14, 6))
    label_ = name + ' (area = %0.4f)' % classifier_roc_auc
    # Plot the classifier ROC
    plt.plot(fpr, tpr, label=label_)
    # Plot the base rate ROC
    plt.plot([0, 1], [0, 1], 'k--', label='Base Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel('True Positive Rate \n', horizontalalignment="center",
               fontstyle="normal", fontsize="medium", fontfamily="sans-serif")
    plt.xlabel('\nFalse Positive Rate \n', horizontalalignment="center",
               fontstyle="normal", fontsize="medium", fontfamily="sans-serif")
    plt.title('ROC Graph \n', horizontalalignment="center",
              fontstyle="normal", fontsize=22, fontfamily="sans-serif")
    plt.legend(loc="lower right", fontsize="medium")
    plt.xticks(rotation=0, horizontalalignment="center")
    plt.yticks(rotation=0, horizontalalignment="right")
    plt.show()

# In[43]:

# SVC is omitted here: without probability=True it exposes no predict_proba
classifiers = [classifier, classifier3, classifier5, classifier6]
model_names_ = ["Logistic Regression", "K-Nearest Neighbours", "Naive Bayes", "Decision Tree"]
for clf, name in zip(classifiers, model_names_):
    ROC_curve(clf, name)
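# The linear SVC is left out of the ROC loop above because it has no predict_proba by
# default. A minimal sketch for including it anyway, using decision_function scores
# (roc_curve accepts any continuous score, not just probabilities); svc_scores, svc_auc,
# svc_fpr and svc_tpr are illustrative names:

# In[ ]:

# Signed margin distances from the separating hyperplane serve as ranking scores
svc_scores = classifier2.decision_function(X_test)
svc_auc = roc_auc_score(y_test, svc_scores)
svc_fpr, svc_tpr, _ = roc_curve(y_test, svc_scores)

plt.figure(figsize=(14, 6))
plt.plot(svc_fpr, svc_tpr, label='SVM (Linear) (area = %0.4f)' % svc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='Base Rate')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Graph')
plt.legend(loc="lower right")
plt.show()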