#!/usr/bin/env python
# coding: utf-8

# # Structure of the notebook
# ### I. Data cleaning
# #### 1. Checking data types.
# #### 2. Treatment of missing data.
# #### 3. Checking the existence of duplicates.
# #### 4. Checking the existence of constant and quasi-constant features.
# #### 5. Verification of the data range constraint for numerical variables.
# #### 6. Investigation of the existence of rare labels in categorical variables.
#
# ### II. Exploratory data analysis
# #### 1. Exploration of the distribution of numerical variables.
# #### 2. Investigation of the relationship between price sensitivity and customer churn.
# #### 3. Further analysis of the data to get more insights.

# In[179]:

import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import pylab
from feature_engine.selection import DropDuplicateFeatures, DropConstantFeatures
from feature_engine.encoding import RareLabelEncoder
import scipy.stats as stats

# ## I. Data cleaning

# In[2]:

client_data = pd.read_csv('client_data.csv')
prices_data = pd.read_csv('price_data.csv')

# In[3]:

len(client_data)

# In[4]:

len(prices_data)

# In[10]:

prices_data.head()

# In[11]:

client_data.head()

# #### 1. Checking data types

# In[13]:

# Checking the data types
client_data.info()

# In[14]:

prices_data.info()

# In[19]:

# Converting the type of the columns containing dates.
# A helper function to change the datatype
def to_date(df, column):
    """
    Takes a dataframe and a column name and converts
    the type of the column to datetime.
    """
    df[column] = pd.to_datetime(df[column])

# In[33]:

# Getting the date columns from the clients dataset
date_cols_client = []
for col in client_data.columns:
    pattern = re.findall(r'date_\w+|\w+_date', col)
    if len(pattern) > 0:
        date_cols_client.append(pattern[0])

# In[34]:

# Getting the date columns from the prices dataset
date_cols_prices = []
for col in prices_data.columns:
    pattern = re.findall(r'date_\w+|\w+_date', col)
    if len(pattern) > 0:
        date_cols_prices.append(pattern[0])

# In[35]:

# Iterating through the date columns in the clients dataset
for col in date_cols_client:
    to_date(client_data, col)

# In[36]:

# Iterating through the date columns in the prices dataset
for col in date_cols_prices:
    to_date(prices_data, col)

# In[37]:

client_data.info()

# In[38]:

prices_data.info()
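# The `.info()` calls above confirm that the date columns now have a datetime type. As an optional extra check (an addition to the original steps), the cell below prints the minimum and maximum of every parsed date column; obviously wrong dates, such as placeholder years, would stand out immediately.

# In[ ]:

# Quick look at the range of every parsed date column
# (assumes date_cols_client and date_cols_prices were built above)
for col in date_cols_client:
    print("client_data['{}'] ranges from {} to {}".format(col, client_data[col].min(), client_data[col].max()))
for col in date_cols_prices:
    print("prices_data['{}'] ranges from {} to {}".format(col, prices_data[col].min(), prices_data[col].max()))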
# #### 2. Treatment of missing data

# In[39]:

# A function to get the unique values of a column
def get_unique(df, column):
    return df[column].unique()


# A function used to verify which values are used to denote missing data
def miss_val(df, column, value):
    """
    Gets the indices of the rows holding a given placeholder value.
    Inputs:
    - The dataframe object.
    - The name of the column.
    - The value to verify.
    Outputs:
    - A list containing the indices of the rows holding that value.
    """
    unique_vls = df[column].unique()
    null_rows = []
    for i in range(len(unique_vls)):
        if str(unique_vls[i]) == str(value):
            # store the indices of the rows containing this value
            null_rows.append(df.loc[df[column] == unique_vls[i], df.columns].index)
    return null_rows

# In[44]:

# Checking the following placeholder values against the clients dataset
vals = ['', '.', '?', np.nan, ' ', 'MISSING', 'missing', 'Missing', '!']
for column in client_data.columns:
    for j in vals:
        if len(miss_val(client_data, column, j)) > 0:
            print("The value used for column '{0}' is '{1}'".format(column, j))

# In[45]:

# Checking the following placeholder values against the prices dataset
vals = ['', '.', '?', np.nan, ' ', 'MISSING', 'missing', 'Missing', '!']
for column in prices_data.columns:
    for j in vals:
        if len(miss_val(prices_data, column, j)) > 0:
            print("The value used for column '{0}' is '{1}'".format(column, j))

# In[46]:

# Replacing the term "MISSING" with np.nan
client_data.loc[client_data['channel_sales'] == "MISSING", 'channel_sales'] = np.nan
client_data.loc[client_data['origin_up'] == "MISSING", 'origin_up'] = np.nan

# In[47]:

# Re-checking the same placeholder values against the clients dataset
vals = ['', '.', '?', np.nan, ' ', 'MISSING', 'missing', 'Missing', '!']
for column in client_data.columns:
    for j in vals:
        if len(miss_val(client_data, column, j)) > 0:
            print("The value used for column '{0}' is '{1}'".format(column, j))

# Missing values are now represented by "nan".

# In[49]:

# Getting the percentage of missing data in each column
client_data.isna().mean()*100

# The feature "channel_sales" has more than 25% of its rows without information, so we set this feature aside.
# For the second feature with missing data, "origin_up", we drop the rows with missing data and keep them in a separate dataframe.

# In[51]:

# Keeping channel_sales apart
df_channel_sales = client_data.pop('channel_sales')

# In[55]:

# Keeping the rows with missing data apart
df_na = client_data[client_data.isna().any(axis=1)]
client_data = client_data[~client_data.isna().any(axis=1)]

# In[56]:

client_data.isna().mean()*100

# #### 3. Checking the existence of duplicates

# In[57]:

# Getting the duplicated rows in the clients dataset
dup = client_data.duplicated()
client_data[dup]

# In[60]:

# Getting the duplicated rows in the prices dataset
dup = prices_data.duplicated()
prices_data[dup]

# So there are no duplicated rows in either dataset.

# In[61]:

# Checking the existence of duplicated features in the clients dataset
sel_1_clt = DropDuplicateFeatures()
sel_1_clt.fit(client_data)

# In[62]:

sel_1_clt.features_to_drop_

# We got an empty set, which means there are no duplicated features in the clients dataset.

# In[65]:

# Checking the existence of duplicated features in the prices dataset
sel_1_pr = DropDuplicateFeatures()
sel_1_pr.fit(prices_data)

# In[66]:

sel_1_pr.features_to_drop_

# The same holds for the prices dataset.
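# The feature_engine selectors reported nothing to drop. As an optional cross-check with pandas alone (an addition, assuming both dataframes are still in memory), the cell below looks for identical columns by transposing each dataframe and flagging duplicated rows of the transpose; casting to string keeps columns of different dtypes comparable.

# In[ ]:

# Pandas-only cross-check for duplicated columns: two features are duplicates
# when their value sequences are identical, i.e. their rows in the transposed
# frame are duplicated.
dup_cols_client = client_data.astype(str).T.duplicated()
dup_cols_prices = prices_data.astype(str).T.duplicated()
print("Duplicated columns in client_data:", list(dup_cols_client[dup_cols_client].index))
print("Duplicated columns in prices_data:", list(dup_cols_prices[dup_cols_prices].index))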
# #### 4. Checking the existence of constant and quasi-constant features

# Constant and quasi-constant features contain almost no information that the model can use to discriminate between data points, so the best thing to do with this kind of feature is to drop it.

# In[68]:

# Checking the existence of constant and quasi-constant features in the clients dataset.
# Variables showing the same value in a percentage of observations greater than
# "tol" will be considered constant/quasi-constant and dropped.
sel_2_clt = DropConstantFeatures(tol=0.98)
sel_2_clt.fit(client_data)

# In[69]:

sel_2_clt.features_to_drop_

# We got an empty list, which means there are no constant or quasi-constant features in the clients dataset.

# In[71]:

# Checking the existence of constant and quasi-constant features in the prices dataset
sel_2_pr = DropConstantFeatures(tol=0.98)
sel_2_pr.fit(prices_data)

# In[72]:

sel_2_pr.features_to_drop_

# Again an empty list: no constant or quasi-constant features in the prices dataset either.

# #### 5. Verification of the data range constraint for numerical variables

# In[99]:

# Getting the categorical features in the clients dataset
cat_ftr_clt = [col for col in client_data.columns if client_data[col].dtype == 'O']
# Getting the numerical features in the clients dataset
num_ftr_clt = [col for col in client_data.columns if col not in cat_ftr_clt + date_cols_client]

# In[100]:

# Checking that the three groups of features cover all the columns
print((len(cat_ftr_clt) + len(num_ftr_clt) + len(date_cols_client)) == len(client_data.columns))

# In[108]:

# Getting the numerical features in the prices dataset
num_ftr_pr = [col for col in prices_data.columns if prices_data[col].dtype == 'float64']
# Getting the categorical features in the prices dataset
cat_ftr_pr = [col for col in prices_data.columns if col not in num_ftr_pr + date_cols_prices]

# In[103]:

# Investigating the existence of negative values in the clients dataset
for col in num_ftr_clt:
    if client_data[col].min() < 0:
        print("Feature {} has negative values".format(col))

# In[110]:

# Investigating the existence of negative values in the prices dataset
for col in num_ftr_pr:
    if prices_data[col].min() < 0:
        print("Feature {} has negative values".format(col))

# So there are no negative values in either dataset.

# #### 6. Investigation of the existence of rare labels in categorical variables

# Rare labels can cause the following issues:
#
# - The data points with rare categories are so few that the algorithm does not get any information from them, which turns them into a source of noise that can make the model overfit the data.
#
# - Rare labels may appear only in the test set, in which case the model will not have learned anything about them during training.

# In[164]:

for col in cat_ftr_clt:
    if col != 'id':
        for cat in client_data[col].unique():
            perct = round(len(client_data[client_data[col] == cat])/len(client_data) * 100, 2)
            print("For feature '{}' the percentage of datapoints with category '{}' is: \n{} %".format(col, cat, perct))
        print("-------------------------------------------------------------------")

# In[169]:

# Grouping rare categories (present in less than 5% of the data points) into a new
# category called "Rare"
enc = RareLabelEncoder(variables=['origin_up'], n_categories=4, tol=0.05)
enc.fit(client_data)

# In[171]:

client_data = enc.transform(client_data)
client_data['origin_up'].unique()

# ### II. Exploratory data analysis

# #### 1. Exploring the distribution of numerical variables.

# In[182]:

client_data[num_ftr_clt].hist(figsize=(20, 20))
plt.show()

# In[175]:

# Plots to assess normality:
# a histogram to get a quick look at the variable distribution,
# next to a Q-Q plot against the normal distribution
def diagnostic_plots(df, column):
    # Plots a histogram and a Q-Q plot side by side for a given variable
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    df[column].hist(bins=30)
    plt.subplot(1, 2, 2)
    stats.probplot(df[column], dist="norm", plot=plt)
    plt.show()

# In[181]:

for col in num_ftr_clt:
    print('Plots for feature {}'.format(col))
    diagnostic_plots(client_data, col)
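# The histograms and Q-Q plots suggest that most numerical features are far from normally distributed. As a complementary, more formal check (an addition to the original analysis), the cell below runs a Shapiro-Wilk test on a random sample of each numerical feature; the sample is capped at 5000 observations because the test is not recommended for larger inputs.

# In[ ]:

# Shapiro-Wilk normality test on a sample of each numerical feature
# (assumes client_data and num_ftr_clt from the cells above)
for col in num_ftr_clt:
    values = client_data[col].dropna()
    sample = values.sample(n=min(5000, len(values)), random_state=0)
    stat, p_value = stats.shapiro(sample)
    verdict = "looks non-normal" if p_value < 0.05 else "no evidence against normality"
    print("{}: W = {:.3f}, p-value = {:.3g} -> {}".format(col, stat, p_value, verdict))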
# #### 2. Investigation of the relationship between price sensitivity and customer churn

# Price sensitivity is a concept that helps us understand how a customer's willingness to buy is affected by the price of a product or service.
# To quantify it we use the price elasticity of demand, which can be computed with the following formula:
#
# Price elasticity of demand = % change in quantity demanded / % change in price
#
# - To compute the % change in quantity we use the variables "cons_12m" and "forecast_cons_12m".
# - To compute the % change in price we use the average of the two variables "forecast_price_energy_off_peak" and "forecast_price_energy_peak", and the average of the variables "price_off_peak_var" and "price_peak_var".

# In[195]:

# Getting the previous year's average price for each customer from the prices dataset
gr_df_pr = prices_data.groupby('id', as_index=False).mean(numeric_only=True)

# In[198]:

# Merging the two dataframes
new_df = client_data.merge(gr_df_pr, how='left', on='id')
new_df

# In[205]:

# Averaging the forecast peak and off-peak prices
new_df["forcasted_price"] = (new_df["forecast_price_energy_off_peak"] + new_df["forecast_price_energy_peak"])/2
# Averaging the previous peak and off-peak prices
new_df["previous_price"] = (new_df["price_off_peak_var"] + new_df["price_peak_var"])/2

# In[212]:

# Computing the percentage change in quantity demanded
prct_qte = ((new_df['forecast_cons_12m'] - new_df['cons_12m'])/new_df['cons_12m']) * 100
# Computing the percentage change in price
prct_prc = ((new_df["forcasted_price"] - new_df["previous_price"])/new_df["previous_price"]) * 100
# Computing the price elasticity of demand
new_df['price_sens'] = prct_qte/prct_prc

# In[222]:

# Plotting the correlation matrix
sns.heatmap(new_df[['churn', 'price_sens', 'forcasted_price', 'previous_price']].corr(), annot=True)
plt.show()

# There is almost no correlation between customer churn and price sensitivity, nor between churn and the other price-related variables.

# #### 3. Further analysis of the data to get more insights

# In[329]:

# Getting the median price for clients who churned and for those who did not
round_func = lambda x: round(x, 3)
churn_price = pd.DataFrame(new_df.groupby(['churn'])['previous_price'].median()).apply(round_func)
churn_price

# In[330]:

# Getting the median of the forecast price for clients who churned and for those who did not
churn_forc_price = pd.DataFrame(new_df.groupby(['churn'])['forcasted_price'].median()).apply(round_func).reset_index()
churn_forc_price

# In[331]:

churn_viz = churn_forc_price.merge(churn_price, how='left', on='churn').set_index('churn')
churn_viz.plot(kind='bar', figsize=(10, 10))
plt.title('Median price comparison')
plt.xlabel('Churn')
plt.ylabel('Median price')
plt.xticks(ticks=np.arange(2), labels=['current customers', 'churned customers'])
plt.show()

# The median prices for customers who churned and for those who did not are almost the same.
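# The bar chart above only compares medians. As an additional check (not part of the original notebook), the cell below runs a Mann-Whitney U test to ask whether the whole distribution of the previous price differs between churned and retained customers; a large p-value would support the reading that the prices are essentially the same.

# In[ ]:

# Mann-Whitney U test on previous_price for churned vs. retained customers
# (assumes new_df from the cells above and churn coded as 0/1)
churned_prices = new_df.loc[new_df['churn'] == 1, 'previous_price'].dropna()
retained_prices = new_df.loc[new_df['churn'] == 0, 'previous_price'].dropna()
stat, p_value = stats.mannwhitneyu(churned_prices, retained_prices, alternative='two-sided')
print("Mann-Whitney U = {:.0f}, p-value = {:.3g}".format(stat, p_value))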
# In[325]:

# Getting the last-month electricity consumption for clients who churned and for those who did not
churn_last_month = pd.DataFrame(new_df.groupby(['churn'])['cons_last_month'].mean())
churn_last_month

# In[327]:

# Code source: https://www.geeksforgeeks.org/how-to-annotate-bars-in-barplot-with-matplotlib-in-python/
# Defining the plot size
plt.figure(figsize=(8, 8))
# Defining the values for the x-axis and y-axis
# and the dataframe from which the values are to be picked
plots = sns.barplot(x="churn", y="cons_last_month", data=churn_last_month.reset_index())
# Iterating over the bars one by one
for bar in plots.patches:
    # Using Matplotlib's annotate function and passing the coordinates
    # where the annotation shall be placed:
    # x-coordinate: bar.get_x() + bar.get_width() / 2
    # y-coordinate: bar.get_height()
    # free space to be left to make the graph pleasing: (0, 8)
    # ha and va stand for the horizontal and vertical alignment
    plots.annotate(format(bar.get_height(), '.2f'),
                   (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                   ha='center', va='center', size=15, xytext=(0, 8),
                   textcoords='offset points')
plt.xlabel("Churn", size=14)
# Setting the label for the y-axis
plt.ylabel("Last month consumption", size=14)
# Setting the title of the graph
plt.title("Last month consumption and churn")
plt.xticks(np.arange(2), labels=['False', 'True'])
# Finally showing the plot
plt.show()

# On average, the last-month consumption of a churned customer is about 42% of that of a customer who stayed.

# In[323]:

# The number of churned customers who were gas consumers
churn_has_gas = pd.DataFrame(new_df.groupby(['has_gas'])['churn'].sum())
churn_has_gas

# In[322]:

# Defining the plot size
plt.figure(figsize=(8, 8))
# Defining the values for the x-axis and y-axis
# and the dataframe from which the values are to be picked
plots = sns.barplot(x="has_gas", y="churn", data=churn_has_gas.reset_index())
# Annotating each bar with its height, as in the previous plot
for bar in plots.patches:
    plots.annotate(format(bar.get_height(), '.2f'),
                   (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                   ha='center', va='center', size=15, xytext=(0, 8),
                   textcoords='offset points')
plt.xlabel("Gas customer", size=14)
# Setting the label for the y-axis
plt.ylabel("Number of churned customers", size=14)
# Setting the title of the graph
plt.title("Customer churn and gas")
plt.xticks(np.arange(2), labels=['False', 'True'])
# Finally showing the plot
plt.show()

# We can see that 85% of churned customers were not gas customers.

# In[339]:

new_df.to_csv('clean_data_after_eda.csv')
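# As a final sanity check (an addition; it assumes churn is coded 0/1, as already used in the correlation matrix), the cell below recomputes the two figures quoted in the comments above: the churned-to-retained ratio of mean last-month consumption, and the share of churned customers broken down by gas subscription.

# In[ ]:

# Recomputing the quoted percentages directly from new_df
mean_cons = new_df.groupby('churn')['cons_last_month'].mean()
print("Churned vs. retained mean last-month consumption: {:.0f}%".format(mean_cons[1] / mean_cons[0] * 100))

gas_share = new_df.loc[new_df['churn'] == 1, 'has_gas'].value_counts(normalize=True) * 100
print("Share of churned customers by gas subscription (%):")
print(gas_share.round(1))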