#!/usr/bin/env python
# coding: utf-8

# # Structure of the notebook
# ### I. Data cleaning
# #### 1. Checking data types.
# #### 2. Treatment of missing data.
# #### 3. Checking the existence of duplicates.
# #### 4. Checking the existence of constant and quasi-constant features.
# #### 5. Verification of the data range constraint for numerical variables.
# #### 6. Investigation of the existence of rare labels in categorical variables.
#
# ### II. Exploratory data analysis
# #### 1. Exploration of the distribution of numerical variables.
# #### 2. Investigation of the relationship between price sensitivity and customer churn.
# #### 3. Further analysis of the data to get more insights.

# In[179]:

import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import pylab
from feature_engine.selection import DropDuplicateFeatures, DropConstantFeatures
from feature_engine.encoding import RareLabelEncoder
import scipy.stats as stats

# ## I. Data cleaning

# In[2]:

client_data = pd.read_csv('client_data.csv')
prices_data = pd.read_csv('price_data.csv')

# In[3]:

len(client_data)

# In[4]:

len(prices_data)

# In[10]:

prices_data.head()

# In[11]:

client_data.head()

# #### 1. Checking data types

# In[13]:

# Checking the data types
client_data.info()

# In[14]:

prices_data.info()

# In[19]:

# Converting the type of the columns containing dates.
# A helper function to change the datatype
def to_date(df, column):
    """
    Takes a dataframe and a column name and converts
    the type of the column to datetime.
    """
    df[column] = pd.to_datetime(df[column])

# In[33]:

# Getting the date columns from the clients dataset
date_cols_client = []
for col in client_data.columns:
    pattern = re.findall(r'date_\w+|\w+_date', col)
    if len(pattern) > 0:
        date_cols_client.append(pattern[0])

# In[34]:

# Getting the date columns from the prices dataset
date_cols_prices = []
for col in prices_data.columns:
    pattern = re.findall(r'date_\w+|\w+_date', col)
    if len(pattern) > 0:
        date_cols_prices.append(pattern[0])

# In[35]:

# Iterating through the date columns in the clients dataset
for col in date_cols_client:
    to_date(client_data, col)

# In[36]:

# Iterating through the date columns in the prices dataset
for col in date_cols_prices:
    to_date(prices_data, col)

# In[37]:

client_data.info()

# In[38]:

prices_data.info()
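# The `.info()` calls above confirm that the date columns now have a datetime type. As an optional extra check (an addition to the original steps), the cell below prints the minimum and maximum of every parsed date column; obviously wrong dates, such as placeholder years, would stand out immediately.

# In[ ]:

# Quick look at the range of every parsed date column
# (assumes date_cols_client and date_cols_prices were built above)
for col in date_cols_client:
    print("client_data['{}'] ranges from {} to {}".format(col, client_data[col].min(), client_data[col].max()))
for col in date_cols_prices:
    print("prices_data['{}'] ranges from {} to {}".format(col, prices_data[col].min(), prices_data[col].max()))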
# #### 2. Treatment of missing data

# In[39]:

# A function to get the unique values of a column
def get_unique(df, column):
    return df[column].unique()


# A function used to verify which values are used to denote missing data
def miss_val(df, column, value):
    """
    Gets the indices of the rows holding a given placeholder value.
    Inputs:
    - The dataframe object.
    - The name of the column.
    - The value to verify.
    Outputs:
    - A list containing the indices of the rows holding that value.
    """
    unique_vls = df[column].unique()
    null_rows = []
    for i in range(len(unique_vls)):
        if str(unique_vls[i]) == str(value):
            # store the indices of the rows containing this value
            null_rows.append(df.loc[df[column] == unique_vls[i], df.columns].index)
    return null_rows

# In[44]:

# Checking the following placeholder values against the clients dataset
vals = ['', '.', '?', np.nan, ' ', 'MISSING', 'missing', 'Missing', '!']
for column in client_data.columns:
    for j in vals:
        if len(miss_val(client_data, column, j)) > 0:
            print("The value used for column '{0}' is '{1}'".format(column, j))

# In[45]:

# Checking the following placeholder values against the prices dataset
vals = ['', '.', '?', np.nan, ' ', 'MISSING', 'missing', 'Missing', '!']
for column in prices_data.columns:
    for j in vals:
        if len(miss_val(prices_data, column, j)) > 0:
            print("The value used for column '{0}' is '{1}'".format(column, j))

# In[46]:

# Replacing the term "MISSING" with np.nan
client_data.loc[client_data['channel_sales'] == "MISSING", 'channel_sales'] = np.nan
client_data.loc[client_data['origin_up'] == "MISSING", 'origin_up'] = np.nan

# In[47]:

# Re-checking the same placeholder values against the clients dataset
vals = ['', '.', '?', np.nan, ' ', 'MISSING', 'missing', 'Missing', '!']
for column in client_data.columns:
    for j in vals:
        if len(miss_val(client_data, column, j)) > 0:
            print("The value used for column '{0}' is '{1}'".format(column, j))

# Missing values are now represented by "nan".

# In[49]:

# Getting the percentage of missing data in each column
client_data.isna().mean()*100

# The feature "channel_sales" has more than 25% of its rows without information, so we set this feature aside.
# For the second feature with missing data, "origin_up", we drop the rows with missing data and keep them in a separate dataframe.

# In[51]:

# Keeping channel_sales apart
df_channel_sales = client_data.pop('channel_sales')

# In[55]:

# Keeping the rows with missing data apart
df_na = client_data[client_data.isna().any(axis=1)]
client_data = client_data[~client_data.isna().any(axis=1)]

# In[56]:

client_data.isna().mean()*100

# #### 3. Checking the existence of duplicates

# In[57]:

# Getting the duplicated rows in the clients dataset
dup = client_data.duplicated()
client_data[dup]

# In[60]:

# Getting the duplicated rows in the prices dataset
dup = prices_data.duplicated()
prices_data[dup]

# So there are no duplicated rows in either dataset.

# In[61]:

# Checking the existence of duplicated features in the clients dataset
sel_1_clt = DropDuplicateFeatures()
sel_1_clt.fit(client_data)

# In[62]:

sel_1_clt.features_to_drop_

# We got an empty set, which means there are no duplicated features in the clients dataset.

# In[65]:

# Checking the existence of duplicated features in the prices dataset
sel_1_pr = DropDuplicateFeatures()
sel_1_pr.fit(prices_data)

# In[66]:

sel_1_pr.features_to_drop_

# The same holds for the prices dataset.
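# The feature_engine selectors reported nothing to drop. As an optional cross-check with pandas alone (an addition, assuming both dataframes are still in memory), the cell below looks for identical columns by transposing each dataframe and flagging duplicated rows of the transpose; casting to string keeps columns of different dtypes comparable.

# In[ ]:

# Pandas-only cross-check for duplicated columns: two features are duplicates
# when their value sequences are identical, i.e. their rows in the transposed
# frame are duplicated.
dup_cols_client = client_data.astype(str).T.duplicated()
dup_cols_prices = prices_data.astype(str).T.duplicated()
print("Duplicated columns in client_data:", list(dup_cols_client[dup_cols_client].index))
print("Duplicated columns in prices_data:", list(dup_cols_prices[dup_cols_prices].index))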
# #### 4. Checking the existence of constant and quasi-constant features

# Constant and quasi-constant features contain almost no information that the model can use to discriminate between data points, so the best thing to do with this kind of feature is to drop it.

# In[68]:

# Checking the existence of constant and quasi-constant features in the clients dataset.
# Variables showing the same value in a percentage of observations greater than
# "tol" will be considered constant/quasi-constant and dropped.
sel_2_clt = DropConstantFeatures(tol=0.98)
sel_2_clt.fit(client_data)

# In[69]:

sel_2_clt.features_to_drop_

# We got an empty list, which means there are no constant or quasi-constant features in the clients dataset.

# In[71]:

# Checking the existence of constant and quasi-constant features in the prices dataset
sel_2_pr = DropConstantFeatures(tol=0.98)
sel_2_pr.fit(prices_data)

# In[72]:

sel_2_pr.features_to_drop_

# Again an empty list: no constant or quasi-constant features in the prices dataset either.

# #### 5. Verification of the data range constraint for numerical variables

# In[99]:

# Getting the categorical features in the clients dataset
cat_ftr_clt = [col for col in client_data.columns if client_data[col].dtype == 'O']
# Getting the numerical features in the clients dataset
num_ftr_clt = [col for col in client_data.columns if col not in cat_ftr_clt + date_cols_client]

# In[100]:

# Checking that the three groups of features cover all the columns
print((len(cat_ftr_clt) + len(num_ftr_clt) + len(date_cols_client)) == len(client_data.columns))

# In[108]:

# Getting the numerical features in the prices dataset
num_ftr_pr = [col for col in prices_data.columns if prices_data[col].dtype == 'float64']
# Getting the categorical features in the prices dataset
cat_ftr_pr = [col for col in prices_data.columns if col not in num_ftr_pr + date_cols_prices]

# In[103]:

# Investigating the existence of negative values in the clients dataset
for col in num_ftr_clt:
    if client_data[col].min() < 0:
        print("Feature {} has negative values".format(col))

# In[110]:

# Investigating the existence of negative values in the prices dataset
for col in num_ftr_pr:
    if prices_data[col].min() < 0:
        print("Feature {} has negative values".format(col))

# So there are no negative values in either dataset.

# #### 6. Investigation of the existence of rare labels in categorical variables

# Rare labels can cause the following issues:
#
# - The data points with rare categories are so few that the algorithm does not get any information from them, which turns them into a source of noise that can make the model overfit the data.
#
# - Rare labels may appear only in the test set, in which case the model will not have learned anything about them during training.

# In[164]:

for col in cat_ftr_clt:
    if col != 'id':
        for cat in client_data[col].unique():
            perct = round(len(client_data[client_data[col] == cat])/len(client_data) * 100, 2)
            print("For feature '{}' the percentage of datapoints with category '{}' is: \n{} %".format(col, cat, perct))
        print("-------------------------------------------------------------------")

# In[169]:

# Grouping rare categories (present in less than 5% of the data points) into a new
# category called "Rare"
enc = RareLabelEncoder(variables=['origin_up'], n_categories=4, tol=0.05)
enc.fit(client_data)

# In[171]:

client_data = enc.transform(client_data)
client_data['origin_up'].unique()

# ### II. Exploratory data analysis

# #### 1. Exploring the distribution of numerical variables.

# In[182]:

client_data[num_ftr_clt].hist(figsize=(20, 20))
plt.show()

# In[175]:

# Plots to assess normality:
# a histogram to get a quick look at the variable distribution,
# next to a Q-Q plot against the normal distribution
def diagnostic_plots(df, column):
    # Plots a histogram and a Q-Q plot side by side for a given variable
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    df[column].hist(bins=30)
    plt.subplot(1, 2, 2)
    stats.probplot(df[column], dist="norm", plot=plt)
    plt.show()

# In[181]:

for col in num_ftr_clt:
    print('Plots for feature {}'.format(col))
    diagnostic_plots(client_data, col)
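# The histograms and Q-Q plots suggest that most numerical features are far from normally distributed. As a complementary, more formal check (an addition to the original analysis), the cell below runs a Shapiro-Wilk test on a random sample of each numerical feature; the sample is capped at 5000 observations because the test is not recommended for larger inputs.

# In[ ]:

# Shapiro-Wilk normality test on a sample of each numerical feature
# (assumes client_data and num_ftr_clt from the cells above)
for col in num_ftr_clt:
    values = client_data[col].dropna()
    sample = values.sample(n=min(5000, len(values)), random_state=0)
    stat, p_value = stats.shapiro(sample)
    verdict = "looks non-normal" if p_value < 0.05 else "no evidence against normality"
    print("{}: W = {:.3f}, p-value = {:.3g} -> {}".format(col, stat, p_value, verdict))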
# #### 2. Investigation of the relationship between price sensitivity and customer churn

# Price sensitivity is a concept that helps us understand how a customer's willingness to buy is affected by the price of a product or service.
# To quantify it we use the price elasticity of demand, which can be computed with the following formula:
#
# Price elasticity of demand = % change in quantity demanded / % change in price
#
# - To compute the % change in quantity we use the variables "cons_12m" and "forecast_cons_12m".
# - To compute the % change in price we use the average of the two variables "forecast_price_energy_off_peak" and "forecast_price_energy_peak", and the average of the variables "price_off_peak_var" and "price_peak_var".

# In[195]:

# Getting the previous year's average price for each customer from the prices dataset
gr_df_pr = prices_data.groupby('id', as_index=False).mean(numeric_only=True)

# In[198]:

# Merging the two dataframes
new_df = client_data.merge(gr_df_pr, how='left', on='id')
new_df

# In[205]:

# Averaging the forecast peak and off-peak prices
new_df["forcasted_price"] = (new_df["forecast_price_energy_off_peak"] + new_df["forecast_price_energy_peak"])/2
# Averaging the previous peak and off-peak prices
new_df["previous_price"] = (new_df["price_off_peak_var"] + new_df["price_peak_var"])/2

# In[212]:

# Computing the percentage change in quantity demanded
prct_qte = ((new_df['forecast_cons_12m'] - new_df['cons_12m'])/new_df['cons_12m']) * 100
# Computing the percentage change in price
prct_prc = ((new_df["forcasted_price"] - new_df["previous_price"])/new_df["previous_price"]) * 100
# Computing the price elasticity of demand
new_df['price_sens'] = prct_qte/prct_prc

# In[222]:

# Plotting the correlation matrix
sns.heatmap(new_df[['churn', 'price_sens', 'forcasted_price', 'previous_price']].corr(), annot=True)
plt.show()

# There is almost no correlation between customer churn and price sensitivity, nor between churn and the other price-related variables.

# #### 3. Further analysis of the data to get more insights

# In[329]:

# Getting the median price for clients who churned and for those who did not
round_func = lambda x: round(x, 3)
churn_price = pd.DataFrame(new_df.groupby(['churn'])['previous_price'].median()).apply(round_func)
churn_price

# In[330]:

# Getting the median of the forecast price for clients who churned and for those who did not
churn_forc_price = pd.DataFrame(new_df.groupby(['churn'])['forcasted_price'].median()).apply(round_func).reset_index()
churn_forc_price

# In[331]:

churn_viz = churn_forc_price.merge(churn_price, how='left', on='churn').set_index('churn')
churn_viz.plot(kind='bar', figsize=(10, 10))
plt.title('Median price comparison')
plt.xlabel('Churn')
plt.ylabel('Median price')
plt.xticks(ticks=np.arange(2), labels=['current customers', 'churned customers'])
plt.show()

# The median prices for customers who churned and for those who did not are almost the same.
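# The bar chart above only compares medians. As an additional check (not part of the original notebook), the cell below runs a Mann-Whitney U test to ask whether the whole distribution of the previous price differs between churned and retained customers; a large p-value would support the reading that the prices are essentially the same.

# In[ ]:

# Mann-Whitney U test on previous_price for churned vs. retained customers
# (assumes new_df from the cells above and churn coded as 0/1)
churned_prices = new_df.loc[new_df['churn'] == 1, 'previous_price'].dropna()
retained_prices = new_df.loc[new_df['churn'] == 0, 'previous_price'].dropna()
stat, p_value = stats.mannwhitneyu(churned_prices, retained_prices, alternative='two-sided')
print("Mann-Whitney U = {:.0f}, p-value = {:.3g}".format(stat, p_value))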
# In[325]:

# Getting the last-month electricity consumption for clients who churned and for those who did not
churn_last_month = pd.DataFrame(new_df.groupby(['churn'])['cons_last_month'].mean())
churn_last_month

# In[327]:

# Code source: https://www.geeksforgeeks.org/how-to-annotate-bars-in-barplot-with-matplotlib-in-python/
# Defining the plot size
plt.figure(figsize=(8, 8))
# Defining the values for the x-axis and y-axis
# and the dataframe from which the values are to be picked
plots = sns.barplot(x="churn", y="cons_last_month", data=churn_last_month.reset_index())
# Iterating over the bars one by one
for bar in plots.patches:
    # Using Matplotlib's annotate function and passing the coordinates
    # where the annotation shall be placed:
    # x-coordinate: bar.get_x() + bar.get_width() / 2
    # y-coordinate: bar.get_height()
    # free space to be left to make the graph pleasing: (0, 8)
    # ha and va stand for the horizontal and vertical alignment
    plots.annotate(format(bar.get_height(), '.2f'),
                   (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                   ha='center', va='center', size=15, xytext=(0, 8),
                   textcoords='offset points')
plt.xlabel("Churn", size=14)
# Setting the label for the y-axis
plt.ylabel("Last month consumption", size=14)
# Setting the title of the graph
plt.title("Last month consumption and churn")
plt.xticks(np.arange(2), labels=['False', 'True'])
# Finally showing the plot
plt.show()

# On average, the last-month consumption of a churned customer is about 42% of that of a customer who stayed.

# In[323]:

# The number of churned customers who were gas consumers
churn_has_gas = pd.DataFrame(new_df.groupby(['has_gas'])['churn'].sum())
churn_has_gas

# In[322]:

# Defining the plot size
plt.figure(figsize=(8, 8))
# Defining the values for the x-axis and y-axis
# and the dataframe from which the values are to be picked
plots = sns.barplot(x="has_gas", y="churn", data=churn_has_gas.reset_index())
# Annotating each bar with its height, as in the previous plot
for bar in plots.patches:
    plots.annotate(format(bar.get_height(), '.2f'),
                   (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                   ha='center', va='center', size=15, xytext=(0, 8),
                   textcoords='offset points')
plt.xlabel("Gas customer", size=14)
# Setting the label for the y-axis
plt.ylabel("Number of churned customers", size=14)
# Setting the title of the graph
plt.title("Customer churn and gas")
plt.xticks(np.arange(2), labels=['False', 'True'])
# Finally showing the plot
plt.show()

# We can see that 85% of churned customers were not gas customers.

# In[339]:

new_df.to_csv('clean_data_after_eda.csv')
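# As a final sanity check (an addition; it assumes churn is coded 0/1, as already used in the correlation matrix), the cell below recomputes the two figures quoted in the comments above: the churned-to-retained ratio of mean last-month consumption, and the share of churned customers broken down by gas subscription.

# In[ ]:

# Recomputing the quoted percentages directly from new_df
mean_cons = new_df.groupby('churn')['cons_last_month'].mean()
print("Churned vs. retained mean last-month consumption: {:.0f}%".format(mean_cons[1] / mean_cons[0] * 100))

gas_share = new_df.loc[new_df['churn'] == 1, 'has_gas'].value_counts(normalize=True) * 100
print("Share of churned customers by gas subscription (%):")
print(gas_share.round(1))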