import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from feature_engine.encoding import OneHotEncoder
from feature_engine.encoding import OrdinalEncoder
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import PRatioEncoder
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, SmartCorrelatedSelection
from feature_engine.wrappers import SklearnTransformerWrapper
import optuna
import shap
import pickle
df = pd.read_csv('clean_data_after_eda.csv')
df["date_activ"] = pd.to_datetime(df["date_activ"], format='%Y-%m-%d')
df["date_end"] = pd.to_datetime(df["date_end"], format='%Y-%m-%d')
df["date_modif_prod"] = pd.to_datetime(df["date_modif_prod"], format='%Y-%m-%d')
df["date_renewal"] = pd.to_datetime(df["date_renewal"], format='%Y-%m-%d')
df.head(3)
Unnamed: 0 | id | cons_12m | cons_gas_12m | cons_last_month | date_activ | date_end | date_modif_prod | date_renewal | forecast_cons_12m | ... | churn | price_off_peak_var | price_peak_var | price_mid_peak_var | price_off_peak_fix | price_peak_fix | price_mid_peak_fix | forcasted_price | previous_price | price_sens | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 24011ae4ebbe3035111d65fa7c15bc57 | 0 | 54946 | 0 | 2013-06-15 | 2016-06-15 | 2015-11-01 | 2015-06-23 | 0.00 | ... | 1 | 0.124787 | 0.100749 | 0.06653 | 40.942265 | 22.35201 | 14.90134 | 0.106312 | 0.112768 | NaN |
1 | 1 | d29c2c54acc38ff3c0614d0a653813dd | 4660 | 0 | 0 | 2009-08-21 | 2016-08-30 | 2009-08-21 | 2015-08-31 | 189.95 | ... | 0 | 0.149609 | 0.007124 | 0.00000 | 44.311375 | 0.00000 | 0.00000 | 0.072855 | 0.078366 | 13.640956 |
2 | 2 | 764c75f661154dac3a6c254cd082ea7d | 544 | 0 | 0 | 2010-04-16 | 2016-04-16 | 2010-04-16 | 2015-04-17 | 47.96 | ... | 0 | 0.170512 | 0.088421 | 0.00000 | 44.385450 | 0.00000 | 0.00000 | 0.126847 | 0.129466 | 45.058910 |
3 rows × 35 columns
df = df.drop('Unnamed: 0', axis=1)
df.head()
id | cons_12m | cons_gas_12m | cons_last_month | date_activ | date_end | date_modif_prod | date_renewal | forecast_cons_12m | forecast_cons_year | ... | churn | price_off_peak_var | price_peak_var | price_mid_peak_var | price_off_peak_fix | price_peak_fix | price_mid_peak_fix | forcasted_price | previous_price | price_sens | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 24011ae4ebbe3035111d65fa7c15bc57 | 0 | 54946 | 0 | 2013-06-15 | 2016-06-15 | 2015-11-01 | 2015-06-23 | 0.00 | 0 | ... | 1 | 0.124787 | 0.100749 | 0.066530 | 40.942265 | 22.352010 | 14.901340 | 0.106312 | 0.112768 | NaN |
1 | d29c2c54acc38ff3c0614d0a653813dd | 4660 | 0 | 0 | 2009-08-21 | 2016-08-30 | 2009-08-21 | 2015-08-31 | 189.95 | 0 | ... | 0 | 0.149609 | 0.007124 | 0.000000 | 44.311375 | 0.000000 | 0.000000 | 0.072855 | 0.078366 | 13.640956 |
2 | 764c75f661154dac3a6c254cd082ea7d | 544 | 0 | 0 | 2010-04-16 | 2016-04-16 | 2010-04-16 | 2015-04-17 | 47.96 | 0 | ... | 0 | 0.170512 | 0.088421 | 0.000000 | 44.385450 | 0.000000 | 0.000000 | 0.126847 | 0.129466 | 45.058910 |
3 | bba03439a292a1e166f80264c16191cb | 1584 | 0 | 0 | 2010-03-30 | 2016-03-30 | 2010-03-30 | 2015-03-31 | 240.04 | 0 | ... | 0 | 0.151210 | 0.000000 | 0.000000 | 44.400265 | 0.000000 | 0.000000 | 0.073347 | 0.075605 | 28.408609 |
4 | 149d57cf92fc41cf94415803a877cb4b | 4425 | 0 | 526 | 2010-01-13 | 2016-03-07 | 2010-01-13 | 2015-03-09 | 445.75 | 526 | ... | 0 | 0.124174 | 0.103638 | 0.072865 | 40.688156 | 24.412893 | 16.275263 | 0.108457 | 0.113906 | 18.799168 |
5 rows × 34 columns
price_df = pd.read_csv('price_data.csv')
price_df["price_date"] = pd.to_datetime(price_df["price_date"], format='%Y-%m-%d')
price_df.head()
id | price_date | price_off_peak_var | price_peak_var | price_mid_peak_var | price_off_peak_fix | price_peak_fix | price_mid_peak_fix | |
---|---|---|---|---|---|---|---|---|
0 | 038af19179925da21a25619c5a24b745 | 2015-01-01 | 0.151367 | 0.0 | 0.0 | 44.266931 | 0.0 | 0.0 |
1 | 038af19179925da21a25619c5a24b745 | 2015-02-01 | 0.151367 | 0.0 | 0.0 | 44.266931 | 0.0 | 0.0 |
2 | 038af19179925da21a25619c5a24b745 | 2015-03-01 | 0.151367 | 0.0 | 0.0 | 44.266931 | 0.0 | 0.0 |
3 | 038af19179925da21a25619c5a24b745 | 2015-04-01 | 0.149626 | 0.0 | 0.0 | 44.266931 | 0.0 | 0.0 |
4 | 038af19179925da21a25619c5a24b745 | 2015-05-01 | 0.149626 | 0.0 | 0.0 | 44.266931 | 0.0 | 0.0 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14542 entries, 0 to 14541
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   id                              14542 non-null  object
 1   cons_12m                        14542 non-null  int64
 2   cons_gas_12m                    14542 non-null  int64
 3   cons_last_month                 14542 non-null  int64
 4   date_activ                      14542 non-null  datetime64[ns]
 5   date_end                        14542 non-null  datetime64[ns]
 6   date_modif_prod                 14542 non-null  datetime64[ns]
 7   date_renewal                    14542 non-null  datetime64[ns]
 8   forecast_cons_12m               14542 non-null  float64
 9   forecast_cons_year              14542 non-null  int64
 10  forecast_discount_energy        14542 non-null  float64
 11  forecast_meter_rent_12m         14542 non-null  float64
 12  forecast_price_energy_off_peak  14542 non-null  float64
 13  forecast_price_energy_peak      14542 non-null  float64
 14  forecast_price_pow_off_peak     14542 non-null  float64
 15  has_gas                         14542 non-null  object
 16  imp_cons                        14542 non-null  float64
 17  margin_gross_pow_ele            14542 non-null  float64
 18  margin_net_pow_ele              14542 non-null  float64
 19  nb_prod_act                     14542 non-null  int64
 20  net_margin                      14542 non-null  float64
 21  num_years_antig                 14542 non-null  int64
 22  origin_up                       14542 non-null  object
 23  pow_max                         14542 non-null  float64
 24  churn                           14542 non-null  int64
 25  price_off_peak_var              14542 non-null  float64
 26  price_peak_var                  14542 non-null  float64
 27  price_mid_peak_var              14542 non-null  float64
 28  price_off_peak_fix              14542 non-null  float64
 29  price_peak_fix                  14542 non-null  float64
 30  price_mid_peak_fix              14542 non-null  float64
 31  forcasted_price                 14542 non-null  float64
 32  previous_price                  14542 non-null  float64
 33  price_sens                      14425 non-null  float64
dtypes: datetime64[ns](4), float64(20), int64(7), object(3)
memory usage: 3.8+ MB
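The info output shows that price_sens is the only column with missing values (14542 − 14425 = 117 rows). A minimal sketch to confirm this directly:
# List columns that contain missing values and how many
missing = df.isna().sum()
print(missing[missing > 0])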
# Group off-peak prices by companies and month
monthly_price_by_id = price_df.groupby(['id', 'price_date']).agg({'price_off_peak_var': 'mean', 'price_off_peak_fix': 'mean'}).reset_index()
# Get January and December prices (first/last row per id works because price_date is sorted ascending within each id)
jan_prices = monthly_price_by_id.groupby('id').first().reset_index()
dec_prices = monthly_price_by_id.groupby('id').last().reset_index()
# Calculate the difference
diff = pd.merge(dec_prices.rename(columns={'price_off_peak_var': 'dec_1', 'price_off_peak_fix': 'dec_2'}), jan_prices.drop(columns='price_date'), on='id')
diff['offpeak_diff_dec_january_energy'] = diff['dec_1'] - diff['price_off_peak_var']
diff['offpeak_diff_dec_january_power'] = diff['dec_2'] - diff['price_off_peak_fix']
diff = diff[['id', 'offpeak_diff_dec_january_energy','offpeak_diff_dec_january_power']]
diff.head()
id | offpeak_diff_dec_january_energy | offpeak_diff_dec_january_power | |
---|---|---|---|
0 | 0002203ffbb812588b632b9e628cc38d | -0.006192 | 0.162916 |
1 | 0004351ebdd665e6ee664792efc4fd13 | -0.004104 | 0.177779 |
2 | 0010bcc39e42b3c2131ed2ce55246e3c | 0.050443 | 1.500000 |
3 | 0010ee3855fdea87602a5b7aba8e42de | -0.010018 | 0.162916 |
4 | 00114d74e963e47177db89bc70108537 | -0.003994 | -0.000001 |
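Note that jan_prices and dec_prices above rely on the rows being ordered by price_date within each id, so that first() and last() really correspond to January and December. An equivalent, more explicit selection by calendar month would be (a sketch, assuming every id has a January and a December 2015 entry):
# Select the January and December rows explicitly instead of relying on row order
jan_prices = monthly_price_by_id[monthly_price_by_id['price_date'].dt.month == 1]
dec_prices = monthly_price_by_id[monthly_price_by_id['price_date'].dt.month == 12]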
We can exploit this by generating a new categorical feature that indicates whether the price increased, decreased, or remained stable during the last year.
# A function to create the new feature "price_change"
def price_change(x):
    if x > 0:
        return 'increase'
    elif x < 0:
        return 'decrease'
    else:
        # Note: NaN differences (ids without price data) also fall through to 'stable'
        return 'stable'
# Merging with the main dataset
new_df_ft = df.merge(diff, how='left', left_on='id', right_on='id')
new_df_ft.head()
id | cons_12m | cons_gas_12m | cons_last_month | date_activ | date_end | date_modif_prod | date_renewal | forecast_cons_12m | forecast_cons_year | ... | price_peak_var | price_mid_peak_var | price_off_peak_fix | price_peak_fix | price_mid_peak_fix | forcasted_price | previous_price | price_sens | offpeak_diff_dec_january_energy | offpeak_diff_dec_january_power | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 24011ae4ebbe3035111d65fa7c15bc57 | 0 | 54946 | 0 | 2013-06-15 | 2016-06-15 | 2015-11-01 | 2015-06-23 | 0.00 | 0 | ... | 0.100749 | 0.066530 | 40.942265 | 22.352010 | 14.901340 | 0.106312 | 0.112768 | NaN | 0.020057 | 3.700961 |
1 | d29c2c54acc38ff3c0614d0a653813dd | 4660 | 0 | 0 | 2009-08-21 | 2016-08-30 | 2009-08-21 | 2015-08-31 | 189.95 | 0 | ... | 0.007124 | 0.000000 | 44.311375 | 0.000000 | 0.000000 | 0.072855 | 0.078366 | 13.640956 | -0.003767 | 0.177779 |
2 | 764c75f661154dac3a6c254cd082ea7d | 544 | 0 | 0 | 2010-04-16 | 2016-04-16 | 2010-04-16 | 2015-04-17 | 47.96 | 0 | ... | 0.088421 | 0.000000 | 44.385450 | 0.000000 | 0.000000 | 0.126847 | 0.129466 | 45.058910 | -0.004670 | 0.177779 |
3 | bba03439a292a1e166f80264c16191cb | 1584 | 0 | 0 | 2010-03-30 | 2016-03-30 | 2010-03-30 | 2015-03-31 | 240.04 | 0 | ... | 0.000000 | 0.000000 | 44.400265 | 0.000000 | 0.000000 | 0.073347 | 0.075605 | 28.408609 | -0.004547 | 0.177779 |
4 | 149d57cf92fc41cf94415803a877cb4b | 4425 | 0 | 526 | 2010-01-13 | 2016-03-07 | 2010-01-13 | 2015-03-09 | 445.75 | 526 | ... | 0.103638 | 0.072865 | 40.688156 | 24.412893 | 16.275263 | 0.108457 | 0.113906 | 18.799168 | -0.006192 | 0.162916 |
5 rows × 36 columns
# Creating the new categorical features
new_df_ft['price_change_energy'] = new_df_ft['offpeak_diff_dec_january_energy'].apply(price_change)
new_df_ft['price_change_power'] = new_df_ft['offpeak_diff_dec_january_power'].apply(price_change)
# Dropping the raw difference features now that the categorical versions exist
new_df_ft = new_df_ft.drop(['offpeak_diff_dec_january_energy','offpeak_diff_dec_january_power'], axis=1)
# Getting the activation year
new_df_ft['activ_year'] = new_df_ft['date_activ'].dt.year
# Getting the activation month
new_df_ft['activ_month'] = new_df_ft['date_activ'].dt.month
# Getting the ending year
new_df_ft['end_year'] = new_df_ft['date_end'].dt.year
# Getting the ending month
new_df_ft['end_month'] = new_df_ft['date_end'].dt.month
# Getting the year of the last modification of the product
new_df_ft['modif_prod_year'] = new_df_ft['date_modif_prod'].dt.year
# Getting the month of the last modification of the product
new_df_ft['modif_prod_month'] = new_df_ft['date_modif_prod'].dt.month
# Getting the renewal year
new_df_ft['renewal_year'] = new_df_ft['date_renewal'].dt.year
# Getting the renewal month
new_df_ft['renewal_month'] = new_df_ft['date_renewal'].dt.month
# Difference between activation and renewal in days
new_df_ft['diff_act_renew'] = (new_df_ft['date_renewal'] - new_df_ft['date_activ']).dt.days
# Difference between activation and end in days
new_df_ft['diff_act_end'] = (new_df_ft['date_end'] - new_df_ft['date_activ']).dt.days
# Difference between activation and last product modification in days
new_df_ft['diff_act_modif'] = (new_df_ft['date_modif_prod'] - new_df_ft['date_activ']).dt.days
# Difference between end and last product modification in days
new_df_ft['diff_end_modif'] = (new_df_ft['date_end'] - new_df_ft['date_modif_prod']).dt.days
# Getting the average consumption per month for the past 12 months.
new_df_ft['avrg_month_cons'] = new_df_ft['cons_12m']/12
# Getting the average gas consumption per month for the past 12 months.
new_df_ft['avrg_month_cons_gaz'] = new_df_ft['cons_gas_12m']/12
# Getting the average forecasted consumption per month for the next 12 months.
new_df_ft['forcast_avrg_month_cons'] = new_df_ft['forecast_cons_12m']/12
# Getting the ratio of the last month consumption to the last 12m consumption
new_df_ft['ratio_last_month_last12m_cons'] = new_df_ft['cons_last_month']/new_df_ft['cons_12m']
# Getting the ratio of the last month consumption to the average monthly consumption over the past 12 months.
new_df_ft['ratio_last_month_avg_cons'] = new_df_ft['cons_last_month']/new_df_ft['avrg_month_cons']
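The year/month extractions above are repetitive; the same eight columns could be created with a short loop (a sketch producing exactly the column names used above):
# Equivalent loop for the year/month features created above
date_cols = {'date_activ': 'activ', 'date_end': 'end', 'date_modif_prod': 'modif_prod', 'date_renewal': 'renewal'}
for col, prefix in date_cols.items():
    new_df_ft[prefix + '_year'] = new_df_ft[col].dt.year
    new_df_ft[prefix + '_month'] = new_df_ft[col].dt.month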
Exploring the distribution of the target variable
# Checking if the target variable is balanced or not.
sns.countplot(x='churn', data=new_df_ft)
plt.show()
We can see that the target variable is imbalanced, so we will use the F1 score as the evaluation metric rather than accuracy. Accuracy is misleading on imbalanced data: a dummy model that always predicts the most common class (no churn) achieves high accuracy while being useless, and an accuracy-based comparison would therefore favour the wrong model.
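To make this concrete, here is a minimal sketch comparing accuracy and F1 for a majority-class predictor:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

y_true = new_df_ft['churn']
y_majority = np.zeros(len(y_true))  # always predict "no churn"
print('accuracy:', accuracy_score(y_true, y_majority))  # high whenever one class dominates
print('f1:', f1_score(y_true, y_majority))              # 0.0 on the churn class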
How we handle outliers depends on the context. Here we have a large number of numerical features with many outliers: removing those rows would discard a lot of information, and capping the values risks distorting the distributions and the relationships between variables. The pragmatic choice in this case is to keep the outliers and use a model that is robust to them, such as a Random Forest.
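As a rough way to quantify how widespread the outliers are before plotting, one can count values outside the usual 1.5×IQR whiskers per feature (a sketch; the cutoff is a convention, not a rule):
# Count values outside the 1.5*IQR whiskers for each numerical feature
num_cols = new_df_ft.select_dtypes(include=['int64', 'float64']).columns
q1 = new_df_ft[num_cols].quantile(0.25)
q3 = new_df_ft[num_cols].quantile(0.75)
iqr = q3 - q1
outside = ((new_df_ft[num_cols] < q1 - 1.5 * iqr) | (new_df_ft[num_cols] > q3 + 1.5 * iqr)).sum()
print(outside.sort_values(ascending=False))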
# Getting numerical features
cols_num = [col for col in new_df_ft.columns if new_df_ft[col].dtype in ['int64', 'float64']]
for col in cols_num:
    print('boxplot of feature {}'.format(col))
    print('--------------------------------------')
    sns.boxplot(y=col, data=new_df_ft)
    plt.show()
    print('--------------------------------------')
[Output: one boxplot per numerical feature in cols_num (44 plots), from cons_12m through ratio_last_month_avg_cons.]
We will use the correlation between features to reduce the number of continuous variables. Within each group of correlated features, which ones to keep is decided by model performance, measured with our evaluation metric, the F1 score.
# Getting numerical features
num_ftr = [col for col in new_df_ft.columns if new_df_ft[col].dtype in ['int64', 'float64']]
# Getting datetime features
date_ftr = [col for col in new_df_ft.columns if new_df_ft[col].dtype == 'datetime64[ns]']
# Getting categorical features
cat_ftr = [col for col in new_df_ft.columns if col not in num_ftr + date_ftr and col != 'id']
print(len(num_ftr)+len(cat_ftr))
print(len(new_df_ft.columns))
48
53
# Checking the existence of "inf" values
new_df_ft.replace([np.inf, -np.inf], np.nan, inplace=True)
# Dropping "Nan" values that resulted from the feature engineering process
new_df_ft = new_df_ft.dropna()
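The replace/dropna pair removes 14542 − 14340 = 202 rows (about 1.4% of the data). A check like the following, run just before the dropna call, shows which columns are responsible (a minimal sketch):
# Which columns carry the NaN/inf values about to be dropped, and how many rows are affected
na_counts = new_df_ft.isna().sum()
print(na_counts[na_counts > 0])
print('rows to drop:', new_df_ft.isna().any(axis=1).sum())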
# Creating an initial feature selection step to eliminate correlated features
sel_1 = SmartCorrelatedSelection(selection_method='model_performance', estimator=RandomForestClassifier(), scoring='f1', cv=5, threshold=0.8)
# Note: num_ftr still includes the target 'churn' at this point; it is removed from the feature set later via df_full.pop('churn')
sel_1.fit(new_df_ft[num_ftr], new_df_ft['churn'])
len(sel_1.features_to_drop_)
21
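Besides the number of dropped features, the fitted selector also records which groups of features it found correlated above the 0.8 threshold, which is useful for sanity-checking the selection (attributes exposed by feature_engine's SmartCorrelatedSelection):
# Inspect the correlated groups and the features chosen for removal
for group in sel_1.correlated_feature_sets_:
    print(sorted(group))
print(sel_1.features_to_drop_)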
# Transforming our dataset
df_1 = sel_1.transform(new_df_ft[num_ftr])
df_1.head()
cons_12m | cons_gas_12m | forecast_cons_12m | forecast_discount_energy | forecast_meter_rent_12m | imp_cons | margin_gross_pow_ele | nb_prod_act | net_margin | pow_max | ... | previous_price | price_sens | end_year | modif_prod_month | renewal_year | renewal_month | diff_act_end | diff_act_modif | diff_end_modif | ratio_last_month_last12m_cons | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 4660 | 0 | 189.95 | 0.0 | 16.27 | 0.00 | 16.38 | 1 | 18.89 | 13.800 | ... | 0.078366 | 13.640956 | 2016 | 8 | 2015 | 8 | 2566 | 0 | 2566 | 0.000000 |
2 | 544 | 0 | 47.96 | 0.0 | 38.72 | 0.00 | 28.60 | 1 | 6.60 | 13.856 | ... | 0.129466 | 45.058910 | 2016 | 4 | 2015 | 4 | 2192 | 0 | 2192 | 0.000000 |
3 | 1584 | 0 | 240.04 | 0.0 | 19.83 | 0.00 | 30.22 | 1 | 25.46 | 13.200 | ... | 0.075605 | 28.408609 | 2016 | 3 | 2015 | 3 | 2192 | 0 | 2192 | 0.000000 |
4 | 4425 | 0 | 445.75 | 0.0 | 131.73 | 52.32 | 44.91 | 1 | 47.98 | 19.800 | ... | 0.113906 | 18.799168 | 2016 | 1 | 2015 | 3 | 2245 | 0 | 2245 | 0.118870 |
5 | 8302 | 0 | 796.94 | 0.0 | 30.12 | 181.21 | 33.12 | 1 | 118.89 | 13.200 | ... | 0.128293 | 40.841311 | 2016 | 11 | 2015 | 12 | 1827 | 1423 | 404 | 0.240665 |
5 rows × 23 columns
# Concatenating the dataframes without the "id" and datetime columns
df_full = pd.concat([new_df_ft[cat_ftr], df_1], axis=1)
df_full
has_gas | origin_up | price_change_energy | price_change_power | cons_12m | cons_gas_12m | forecast_cons_12m | forecast_discount_energy | forecast_meter_rent_12m | imp_cons | ... | previous_price | price_sens | end_year | modif_prod_month | renewal_year | renewal_month | diff_act_end | diff_act_modif | diff_end_modif | ratio_last_month_last12m_cons | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | increase | 4660 | 0 | 189.95 | 0.0 | 16.27 | 0.00 | ... | 0.078366 | 13.640956 | 2016 | 8 | 2015 | 8 | 2566 | 0 | 2566 | 0.000000 |
2 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | increase | 544 | 0 | 47.96 | 0.0 | 38.72 | 0.00 | ... | 0.129466 | 45.058910 | 2016 | 4 | 2015 | 4 | 2192 | 0 | 2192 | 0.000000 |
3 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | increase | 1584 | 0 | 240.04 | 0.0 | 19.83 | 0.00 | ... | 0.075605 | 28.408609 | 2016 | 3 | 2015 | 3 | 2192 | 0 | 2192 | 0.000000 |
4 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | increase | 4425 | 0 | 445.75 | 0.0 | 131.73 | 52.32 | ... | 0.113906 | 18.799168 | 2016 | 1 | 2015 | 3 | 2245 | 0 | 2245 | 0.118870 |
5 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | decrease | 8302 | 0 | 796.94 | 0.0 | 30.12 | 181.21 | ... | 0.128293 | 40.841311 | 2016 | 11 | 2015 | 12 | 1827 | 1423 | 404 | 0.240665 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
14537 | t | lxidpiddsbxsbosboudacockeimpuepw | decrease | increase | 32270 | 47940 | 4648.01 | 0.0 | 18.57 | 0.00 | ... | 0.072062 | 21.198938 | 2016 | 5 | 2014 | 5 | 1445 | 1079 | 366 | 0.000000 |
14538 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | increase | 7223 | 0 | 631.69 | 0.0 | 144.03 | 15.94 | ... | 0.101103 | 18.186301 | 2016 | 8 | 2015 | 8 | 1461 | 0 | 1461 | 0.025059 |
14539 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | increase | 1844 | 0 | 190.39 | 0.0 | 129.60 | 18.05 | ... | 0.114066 | 18.237929 | 2016 | 2 | 2015 | 2 | 1460 | 0 | 1460 | 0.097072 |
14540 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | increase | 131 | 0 | 19.34 | 0.0 | 7.18 | 0.00 | ... | 0.078366 | 12.121174 | 2016 | 8 | 2015 | 8 | 1461 | 0 | 1461 | 0.000000 |
14541 | f | ldkssxwpmemidmecebumciepifcamkci | decrease | decrease | 8730 | 0 | 762.41 | 0.0 | 1.07 | 0.00 | ... | 0.128003 | 501.481008 | 2016 | 12 | 2015 | 12 | 2556 | 0 | 2556 | 0.000000 |
14340 rows × 27 columns
# Getting the target 'churn'
y = df_full.pop('churn')
y
1        0
2        0
3        0
4        0
5        1
        ..
14537    0
14538    1
14539    1
14540    0
14541    0
Name: churn, Length: 14340, dtype: int64
# Splitting the dataset into train and test sets
train_X, test_X, train_y, test_y = train_test_split(df_full, y, test_size=0.30)
print(len(train_X), len(test_X), len(train_y), len(test_y))
10038 4302 10038 4302
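Since churn is imbalanced, a stratified split with a fixed seed would keep the class ratio identical in both sets and make the split reproducible. This is a variant of the call above, not what was run here:
# Stratified, reproducible alternative to the split above (not used in the runs below)
train_X, test_X, train_y, test_y = train_test_split(df_full, y, test_size=0.30, stratify=y, random_state=42)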
# Getting the names of the features after the first phase of feature selection
# Getting numerical features
num_ftr = [col for col in df_full.columns if df_full[col].dtype in ['int64', 'float64']]
# Getting datetime features
date_ftr = [col for col in df_full.columns if df_full[col].dtype == 'datetime64[ns]']
# Getting categorical features
cat_ftr = [col for col in df_full.columns if col not in num_ftr + date_ftr and col != 'id']
print(len(num_ftr)+len(cat_ftr))
print(len(new_df_ft.columns))
26
53
The reasons why Random Forest is a good choice are:
A large number of features (as in our dataset) increases model complexity and the risk of overfitting. Random Forest is a bagging algorithm, and bagging tends to reduce variance, i.e. overfitting.
It handles outliers well: decision trees split on thresholds (is variable x greater than or equal to some value?), so an extreme value simply falls on one side of the split without distorting it.
The main disadvantage of Random Forest models is their relatively slow inference, which makes predictions expensive.
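Before tuning, it helps to know what an un-tuned Random Forest scores under the same 5-fold F1 protocol. A minimal baseline sketch (categorical columns are one-hot encoded with pandas here just so the model accepts them; the tuned pipeline below handles encoding properly):
# Quick un-tuned baseline: encode categoricals, then 5-fold cross-validated F1 with a default RF
X_base = pd.get_dummies(train_X, columns=cat_ftr)
baseline_f1 = cross_val_score(RandomForestClassifier(), X_base, train_y, cv=5, scoring='f1').mean()
print('baseline F1:', round(baseline_f1, 3))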
# A function to fit and tune Random Forest
def objective(trial):
    # Defining the scalers space
    scalers = trial.suggest_categorical('scalers', ['minmax', 'standard', 'robust'])
    if scalers == 'minmax':
        scaler = MinMaxScaler()
    elif scalers == 'robust':
        scaler = RobustScaler()
    else:
        scaler = StandardScaler()
    # Defining the encoder space
    encoders = trial.suggest_categorical('encoders', ['Ordinal', 'OneHot', 'Count', 'Mean', 'PRatio'])
    if encoders == 'Ordinal':
        encoder = OrdinalEncoder()
    elif encoders == 'OneHot':
        encoder = OneHotEncoder()
    elif encoders == 'Count':
        encoder = CountFrequencyEncoder()
    elif encoders == 'Mean':
        encoder = MeanEncoder()
    else:
        encoder = PRatioEncoder()
    # Defining the n_estimators space
    n_estims = trial.suggest_int('n_estimators', low=20, high=400)
    # Defining the max_depth space
    max_dep = trial.suggest_int('max_depth', low=2, high=14)
    # Defining the space for k, the number of features kept by the mutual information selector (second selection phase)
    k = trial.suggest_int('k', low=5, high=len(train_X.columns))
    # Preprocessing steps for the numerical variables
    num_preproc = Pipeline(steps=[('scaler', scaler)])
    # Preprocessing steps for the categorical variables
    cat_preproc = Pipeline(steps=[('encoder', encoder)])
    preprocessor = ColumnTransformer(transformers=[('numerical_preprocessor', num_preproc, num_ftr),
                                                   ('cat_preprocessor', cat_preproc, cat_ftr)])
    model_pipe = Pipeline(steps=[('preprocessing', preprocessor),
                                 ('selector', SelectKBest(mutual_info_classif, k=k)),
                                 ('model', RandomForestClassifier(n_estimators=n_estims, max_depth=max_dep, max_features=None))])
    cv_score = cross_val_score(model_pipe, train_X, train_y, cv=5, scoring='f1').mean()
    return cv_score
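The objective can be sanity-checked on a single hand-picked parameter set before launching the full search, for example with optuna.trial.FixedTrial (the parameter values here are arbitrary):
# Evaluate the objective once with fixed parameters (no optimization)
fixed = optuna.trial.FixedTrial({'scalers': 'robust', 'encoders': 'Ordinal', 'n_estimators': 100, 'max_depth': 10, 'k': 15})
print(objective(fixed))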
study_1 = optuna.create_study(direction='maximize')
study_1.optimize(objective, n_trials=200, show_progress_bar=True)
[I 2022-11-07 22:24:23,896] A new study created in memory with name: no-name-2ed35ba7-b223-422f-ab69-f7950ec3ee84
Progress bar is experimental (supported from v1.2.0). The interface can change in the future.
0%| | 0/200 [00:00<?, ?it/s]
[I 2022-11-07 22:25:46,724] Trial 0 finished with value: 0.02000050001250031 and parameters: {'scalers': 'standard', 'encoders': 'PRatio', 'n_estimators': 59, 'max_depth': 6, 'k': 21}. Best is trial 0 with value: 0.02000050001250031. [I 2022-11-07 22:29:56,720] Trial 1 finished with value: 0.08001219496421315 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 157, 'max_depth': 13, 'k': 19}. Best is trial 1 with value: 0.08001219496421315. [I 2022-11-07 22:34:40,034] Trial 2 finished with value: 0.019491995314330338 and parameters: {'scalers': 'standard', 'encoders': 'Count', 'n_estimators': 368, 'max_depth': 8, 'k': 13}. Best is trial 1 with value: 0.08001219496421315. [I 2022-11-07 22:38:05,982] Trial 3 finished with value: 0.0040405071040481595 and parameters: {'scalers': 'minmax', 'encoders': 'Mean', 'n_estimators': 375, 'max_depth': 8, 'k': 10}. Best is trial 1 with value: 0.08001219496421315. [I 2022-11-07 22:42:06,856] Trial 4 finished with value: 0.04516023479961177 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 296, 'max_depth': 14, 'k': 9}. Best is trial 1 with value: 0.08001219496421315. [I 2022-11-07 22:43:46,343] Trial 5 finished with value: 0.011872418768970494 and parameters: {'scalers': 'standard', 'encoders': 'OneHot', 'n_estimators': 173, 'max_depth': 7, 'k': 19}. Best is trial 1 with value: 0.08001219496421315. [I 2022-11-07 22:44:36,023] Trial 6 finished with value: 0.0 and parameters: {'scalers': 'minmax', 'encoders': 'OneHot', 'n_estimators': 189, 'max_depth': 2, 'k': 15}. Best is trial 1 with value: 0.08001219496421315. [I 2022-11-07 22:44:54,949] Trial 7 finished with value: 0.0 and parameters: {'scalers': 'robust', 'encoders': 'Ordinal', 'n_estimators': 50, 'max_depth': 2, 'k': 5}. Best is trial 1 with value: 0.08001219496421315. [I 2022-11-07 22:48:24,059] Trial 8 finished with value: 0.027297942063936216 and parameters: {'scalers': 'minmax', 'encoders': 'Count', 'n_estimators': 324, 'max_depth': 10, 'k': 16}. Best is trial 1 with value: 0.08001219496421315. [I 2022-11-07 22:48:58,245] Trial 9 finished with value: 0.0 and parameters: {'scalers': 'standard', 'encoders': 'Mean', 'n_estimators': 154, 'max_depth': 2, 'k': 12}. Best is trial 1 with value: 0.08001219496421315. [I 2022-11-07 22:50:56,337] Trial 10 finished with value: 0.12215848038837698 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 107, 'max_depth': 14, 'k': 25}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 22:53:10,109] Trial 11 finished with value: 0.11695928527011001 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 116, 'max_depth': 14, 'k': 26}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 22:54:51,698] Trial 12 finished with value: 0.09870864496180518 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 98, 'max_depth': 12, 'k': 26}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 22:58:36,481] Trial 13 finished with value: 0.07546092982217026 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 246, 'max_depth': 11, 'k': 25}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:00:24,815] Trial 14 finished with value: 0.05992097537630079 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 106, 'max_depth': 10, 'k': 23}. Best is trial 10 with value: 0.12215848038837698. 
[I 2022-11-07 23:02:53,016] Trial 15 finished with value: 0.11266373171297536 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 116, 'max_depth': 14, 'k': 23}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:05:15,838] Trial 16 finished with value: 0.022061706208883265 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 237, 'max_depth': 5, 'k': 26}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:06:00,640] Trial 17 finished with value: 0.09390634381647181 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 28, 'max_depth': 12, 'k': 23}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:07:56,175] Trial 18 finished with value: 0.0786826175574421 and parameters: {'scalers': 'robust', 'encoders': 'Count', 'n_estimators': 94, 'max_depth': 14, 'k': 19}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:10:07,674] Trial 19 finished with value: 0.062054949862734896 and parameters: {'scalers': 'robust', 'encoders': 'Mean', 'n_estimators': 134, 'max_depth': 10, 'k': 21}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:13:15,764] Trial 20 finished with value: 0.09647706285357371 and parameters: {'scalers': 'robust', 'encoders': 'Ordinal', 'n_estimators': 207, 'max_depth': 12, 'k': 24}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:14:41,974] Trial 21 finished with value: 0.10858109491606124 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 124, 'max_depth': 14, 'k': 22}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:15:47,513] Trial 22 finished with value: 0.10886685506656094 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 82, 'max_depth': 13, 'k': 26}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:16:12,181] Trial 23 finished with value: 0.11625985415430269 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 24, 'max_depth': 13, 'k': 24}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:16:40,918] Trial 24 finished with value: 0.062044688969471 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 36, 'max_depth': 13, 'k': 17}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:17:28,059] Trial 25 finished with value: 0.03703005843905579 and parameters: {'scalers': 'robust', 'encoders': 'OneHot', 'n_estimators': 69, 'max_depth': 11, 'k': 24}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:17:53,263] Trial 26 finished with value: 0.0973557369321169 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 27, 'max_depth': 13, 'k': 21}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:18:40,213] Trial 27 finished with value: 0.07653404205858913 and parameters: {'scalers': 'minmax', 'encoders': 'PRatio', 'n_estimators': 68, 'max_depth': 11, 'k': 25}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:19:45,924] Trial 28 finished with value: 0.04122113964670353 and parameters: {'scalers': 'standard', 'encoders': 'PRatio', 'n_estimators': 137, 'max_depth': 9, 'k': 20}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:20:10,719] Trial 29 finished with value: 0.026003158991630766 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 56, 'max_depth': 5, 'k': 22}. Best is trial 10 with value: 0.12215848038837698. 
[I 2022-11-07 23:22:45,833] Trial 30 finished with value: 0.09316126568338072 and parameters: {'scalers': 'standard', 'encoders': 'Count', 'n_estimators': 247, 'max_depth': 12, 'k': 24}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:24:20,333] Trial 31 finished with value: 0.10974882029041681 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 123, 'max_depth': 14, 'k': 23}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:26:31,086] Trial 32 finished with value: 0.11043810658008321 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 160, 'max_depth': 14, 'k': 25}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:28:01,014] Trial 33 finished with value: 0.1021883855657579 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 109, 'max_depth': 13, 'k': 26}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:30:11,767] Trial 34 finished with value: 0.1096204786532144 and parameters: {'scalers': 'robust', 'encoders': 'PRatio', 'n_estimators': 190, 'max_depth': 14, 'k': 18}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:31:10,930] Trial 35 finished with value: 0.11951040700849598 and parameters: {'scalers': 'robust', 'encoders': 'Mean', 'n_estimators': 75, 'max_depth': 13, 'k': 22}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:32:10,102] Trial 36 finished with value: 0.05373259920606606 and parameters: {'scalers': 'robust', 'encoders': 'Mean', 'n_estimators': 81, 'max_depth': 13, 'k': 20}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:32:49,051] Trial 37 finished with value: 0.087402819350508 and parameters: {'scalers': 'minmax', 'encoders': 'Mean', 'n_estimators': 48, 'max_depth': 12, 'k': 22}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:33:10,495] Trial 38 finished with value: 0.06197393344046702 and parameters: {'scalers': 'standard', 'encoders': 'Mean', 'n_estimators': 20, 'max_depth': 9, 'k': 24}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:33:52,251] Trial 39 finished with value: 0.02576296151782666 and parameters: {'scalers': 'robust', 'encoders': 'Ordinal', 'n_estimators': 82, 'max_depth': 7, 'k': 21}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:35:13,633] Trial 40 finished with value: 0.043185322728012364 and parameters: {'scalers': 'minmax', 'encoders': 'OneHot', 'n_estimators': 148, 'max_depth': 13, 'k': 14}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:37:01,722] Trial 41 finished with value: 0.11407024920445999 and parameters: {'scalers': 'robust', 'encoders': 'Mean', 'n_estimators': 115, 'max_depth': 14, 'k': 25}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:37:46,804] Trial 42 finished with value: 0.11212327723263163 and parameters: {'scalers': 'robust', 'encoders': 'Mean', 'n_estimators': 47, 'max_depth': 13, 'k': 25}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:38:26,873] Trial 43 finished with value: 0.05541263226549953 and parameters: {'scalers': 'robust', 'encoders': 'Mean', 'n_estimators': 67, 'max_depth': 14, 'k': 10}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:40:00,389] Trial 44 finished with value: 0.07648470383910537 and parameters: {'scalers': 'robust', 'encoders': 'Mean', 'n_estimators': 94, 'max_depth': 11, 'k': 26}. Best is trial 10 with value: 0.12215848038837698. 
[I 2022-11-07 23:41:40,351] Trial 45 finished with value: 0.023795445438638325 and parameters: {'scalers': 'robust', 'encoders': 'Mean', 'n_estimators': 169, 'max_depth': 12, 'k': 5}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:43:05,339] Trial 46 finished with value: 0.10819682223626667 and parameters: {'scalers': 'standard', 'encoders': 'Mean', 'n_estimators': 46, 'max_depth': 14, 'k': 25}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:45:33,555] Trial 47 finished with value: 0.033124949802665206 and parameters: {'scalers': 'robust', 'encoders': 'Count', 'n_estimators': 332, 'max_depth': 13, 'k': 7}. Best is trial 10 with value: 0.12215848038837698. [I 2022-11-07 23:50:33,707] Trial 48 finished with value: 0.12450634895656347 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 399, 'max_depth': 14, 'k': 23}. Best is trial 48 with value: 0.12450634895656347. [I 2022-11-07 23:51:41,141] Trial 49 finished with value: 0.0 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 314, 'max_depth': 3, 'k': 23}. Best is trial 48 with value: 0.12450634895656347. [I 2022-11-07 23:55:29,906] Trial 50 finished with value: 0.08603689910259196 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 396, 'max_depth': 12, 'k': 20}. Best is trial 48 with value: 0.12450634895656347. [I 2022-11-07 23:57:49,343] Trial 51 finished with value: 0.12499983435316568 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 183, 'max_depth': 14, 'k': 24}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:01:34,075] Trial 52 finished with value: 0.11100715872215643 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 268, 'max_depth': 13, 'k': 24}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:11:39,523] Trial 53 finished with value: 0.11400206709405183 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 356, 'max_depth': 14, 'k': 22}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:15:40,318] Trial 54 finished with value: 0.11418972138693484 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 187, 'max_depth': 13, 'k': 23}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:20:19,038] Trial 55 finished with value: 0.11234842911618251 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 215, 'max_depth': 14, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:22:57,834] Trial 56 finished with value: 0.04093122669996963 and parameters: {'scalers': 'minmax', 'encoders': 'OneHot', 'n_estimators': 149, 'max_depth': 12, 'k': 24}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:27:48,415] Trial 57 finished with value: 0.08751418373014593 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 282, 'max_depth': 13, 'k': 22}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:30:27,497] Trial 58 finished with value: 0.11032404316551223 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 136, 'max_depth': 14, 'k': 19}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:32:17,892] Trial 59 finished with value: 0.0952778588489301 and parameters: {'scalers': 'minmax', 'encoders': 'PRatio', 'n_estimators': 96, 'max_depth': 11, 'k': 23}. Best is trial 51 with value: 0.12499983435316568. 
[I 2022-11-08 00:35:29,990] Trial 60 finished with value: 0.03494684181754932 and parameters: {'scalers': 'standard', 'encoders': 'Count', 'n_estimators': 218, 'max_depth': 12, 'k': 16}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:39:03,967] Trial 61 finished with value: 0.1175095397269917 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 187, 'max_depth': 13, 'k': 24}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:42:39,034] Trial 62 finished with value: 0.10396139527848065 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 183, 'max_depth': 13, 'k': 24}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:47:02,469] Trial 63 finished with value: 0.12492877492877492 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 199, 'max_depth': 14, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:51:09,761] Trial 64 finished with value: 0.1173680228396123 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 205, 'max_depth': 14, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 00:55:43,234] Trial 65 finished with value: 0.11577191340445274 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 228, 'max_depth': 14, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 01:00:38,138] Trial 66 finished with value: 0.10522828026535509 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 204, 'max_depth': 14, 'k': 21}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 01:05:27,315] Trial 67 finished with value: 0.119321724704744 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 198, 'max_depth': 14, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 01:10:55,870] Trial 68 finished with value: 0.11235073778796667 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 256, 'max_depth': 14, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 01:14:04,225] Trial 69 finished with value: 0.1021562241022966 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 173, 'max_depth': 13, 'k': 23}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 01:18:18,087] Trial 70 finished with value: 0.0983348373734964 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 234, 'max_depth': 13, 'k': 22}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 01:22:29,894] Trial 71 finished with value: 0.12309811654917335 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 195, 'max_depth': 14, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 01:26:55,250] Trial 72 finished with value: 0.11896161450484044 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 202, 'max_depth': 14, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 01:32:20,567] Trial 73 finished with value: 0.11739079655402193 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 197, 'max_depth': 14, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 01:38:12,746] Trial 74 finished with value: 0.11431911049496948 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 224, 'max_depth': 14, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. 
[I 2022-11-08 12:55:16,306] Trial 75 finished with value: 0.12121624046162985 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 170, 'max_depth': 14, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 12:58:47,195] Trial 76 finished with value: 0.1229774433208767 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 166, 'max_depth': 14, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:02:49,643] Trial 77 finished with value: 0.08387298497270598 and parameters: {'scalers': 'minmax', 'encoders': 'OneHot', 'n_estimators': 164, 'max_depth': 13, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:05:52,395] Trial 78 finished with value: 0.12295939952766674 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 180, 'max_depth': 14, 'k': 24}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:07:27,278] Trial 79 finished with value: 0.043370787580910175 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 175, 'max_depth': 14, 'k': 12}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:08:23,854] Trial 80 finished with value: 0.026012832736104267 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 154, 'max_depth': 5, 'k': 24}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:10:22,485] Trial 81 finished with value: 0.11971846845977804 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 145, 'max_depth': 14, 'k': 23}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:12:19,478] Trial 82 finished with value: 0.12124659715329726 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 128, 'max_depth': 14, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:13:58,586] Trial 83 finished with value: 0.11763272048987843 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 128, 'max_depth': 14, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:16:16,232] Trial 84 finished with value: 0.11243584228368761 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 175, 'max_depth': 14, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:18:13,008] Trial 85 finished with value: 0.09428554282479884 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 161, 'max_depth': 13, 'k': 24}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:19:53,500] Trial 86 finished with value: 0.08578836111680639 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 136, 'max_depth': 12, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:20:46,792] Trial 87 finished with value: 0.039686584767997184 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 105, 'max_depth': 7, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:23:12,057] Trial 88 finished with value: 0.12416383395885273 and parameters: {'scalers': 'minmax', 'encoders': 'Count', 'n_estimators': 181, 'max_depth': 14, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:24:05,313] Trial 89 finished with value: 0.00797105091219782 and parameters: {'scalers': 'standard', 'encoders': 'Count', 'n_estimators': 180, 'max_depth': 4, 'k': 24}. Best is trial 51 with value: 0.12499983435316568. 
[I 2022-11-08 13:26:25,070] Trial 90 finished with value: 0.11760317313385996 and parameters: {'scalers': 'minmax', 'encoders': 'Count', 'n_estimators': 195, 'max_depth': 13, 'k': 24}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:28:38,382] Trial 91 finished with value: 0.11606558031791676 and parameters: {'scalers': 'minmax', 'encoders': 'Count', 'n_estimators': 167, 'max_depth': 14, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:43:19,785] Trial 92 finished with value: 0.11240287691262038 and parameters: {'scalers': 'minmax', 'encoders': 'Count', 'n_estimators': 213, 'max_depth': 14, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:45:50,615] Trial 93 finished with value: 0.1179314951463877 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 155, 'max_depth': 14, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:47:40,917] Trial 94 finished with value: 0.10048946333283904 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 123, 'max_depth': 13, 'k': 26}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:50:14,494] Trial 95 finished with value: 0.11096815115326761 and parameters: {'scalers': 'minmax', 'encoders': 'Ordinal', 'n_estimators': 192, 'max_depth': 13, 'k': 25}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:52:07,649] Trial 96 finished with value: 0.06929823177923725 and parameters: {'scalers': 'minmax', 'encoders': 'OneHot', 'n_estimators': 143, 'max_depth': 14, 'k': 23}. Best is trial 51 with value: 0.12499983435316568. [I 2022-11-08 13:54:31,081] Trial 97 finished with value: 0.13111670517989565 and parameters: {'scalers': 'minmax', 'encoders': 'Count', 'n_estimators': 180, 'max_depth': 14, 'k': 24}. Best is trial 97 with value: 0.13111670517989565. [I 2022-11-08 13:56:28,274] Trial 98 finished with value: 0.05653999955419834 and parameters: {'scalers': 'minmax', 'encoders': 'Count', 'n_estimators': 185, 'max_depth': 9, 'k': 24}. Best is trial 97 with value: 0.13111670517989565. [I 2022-11-08 13:59:15,140] Trial 99 finished with value: 0.11229480031157509 and parameters: {'scalers': 'minmax', 'encoders': 'Count', 'n_estimators': 209, 'max_depth': 13, 'k': 23}. Best is trial 97 with value: 0.13111670517989565. [I 2022-11-08 14:03:20,328] Trial 100 finished with value: 0.12306967417485255 and parameters: {'scalers': 'standard', 'encoders': 'Count', 'n_estimators': 248, 'max_depth': 14, 'k': 24}. Best is trial 97 with value: 0.13111670517989565. [I 2022-11-08 14:08:58,860] Trial 101 finished with value: 0.11031284717762942 and parameters: {'scalers': 'standard', 'encoders': 'Count', 'n_estimators': 339, 'max_depth': 14, 'k': 24}. Best is trial 97 with value: 0.13111670517989565. [I 2022-11-08 14:12:48,992] Trial 102 finished with value: 0.11588333330237263 and parameters: {'scalers': 'standard', 'encoders': 'Count', 'n_estimators': 242, 'max_depth': 14, 'k': 23}. Best is trial 97 with value: 0.13111670517989565. [I 2022-11-08 14:17:55,964] Trial 103 finished with value: 0.11741549220411811 and parameters: {'scalers': 'standard', 'encoders': 'Count', 'n_estimators': 300, 'max_depth': 14, 'k': 25}. Best is trial 97 with value: 0.13111670517989565. [I 2022-11-08 14:21:06,779] Trial 104 finished with value: 0.0979050726590249 and parameters: {'scalers': 'standard', 'encoders': 'Count', 'n_estimators': 222, 'max_depth': 14, 'k': 18}. Best is trial 97 with value: 0.13111670517989565. 
[I 2022-11-08 14:23:58,609] Trial 105 finished with value: 0.10891686446178514 and parameters: {'scalers': 'standard', 'encoders': 'Count', 'n_estimators': 178, 'max_depth': 13, 'k': 24}. Best is trial 97 with value: 0.13111670517989565.
[... Optuna log truncated: trials 106-198 omitted; the best trial remained trial 97 with value 0.13111670517989565 throughout ...]
[I 2022-11-08 19:12:17,001] Trial 199 finished with value: 0.10323014536473296 and parameters: {'scalers': 'minmax', 'encoders': 'Count', 'n_estimators': 195, 'max_depth': 13, 'k': 25}. Best is trial 97 with value: 0.13111670517989565.
study_1.best_params
{'scalers': 'minmax', 'encoders': 'Count', 'n_estimators': 180, 'max_depth': 14, 'k': 24}
# Training the model with the found parameters
# Preprocessing steps for the numerical variables
num_preproc = Pipeline(steps=[('scaler',MinMaxScaler())])
# Preprocessing steps for the categorical variables
cat_preproc = Pipeline(steps=[('encoder', CountFrequencyEncoder())])
preprocessor = ColumnTransformer(transformers=[('numerical_preprocessor',num_preproc, num_ftr),
('cat_preprocessor', cat_preproc, cat_ftr)])
model_pipe = Pipeline(steps=[("preprocessing", preprocessor),
('selector', SelectKBest(mutual_info_classif, k=24)),
("model", RandomForestClassifier(n_estimators=180, max_depth=14, max_features=None))])
model_pipe.fit(train_X, train_y)
Pipeline(steps=[('preprocessing', ColumnTransformer(transformers=[('numerical_preprocessor', Pipeline(steps=[('scaler', MinMaxScaler())]), ['cons_12m', 'cons_gas_12m', 'forecast_cons_12m', 'forecast_discount_energy', 'forecast_meter_rent_12m', 'imp_cons', 'margin_gross_pow_ele', 'nb_prod_act', 'net_margin', 'pow_max', 'price_off_peak_var', 'price_off_peak_fix', 'p... 'ratio_last_month_last12m_cons']), ('cat_preprocessor', Pipeline(steps=[('encoder', CountFrequencyEncoder())]), ['has_gas', 'origin_up', 'price_change_energy', 'price_change_power'])])), ('selector', SelectKBest(k=24, score_func=<function mutual_info_classif at 0x000001FE454FD940>)), ('model', RandomForestClassifier(max_depth=14, max_features=None, n_estimators=180))])
# The classification report of the model
print(classification_report(test_y, model_pipe.predict(test_X)))
              precision    recall  f1-score   support

           0       0.93      1.00      0.97      3876
           1       0.99      0.36      0.53       426

    accuracy                           0.94      4302
   macro avg       0.96      0.68      0.75      4302
weighted avg       0.94      0.94      0.92      4302
# Visualizing the confusion matrix
cfm=confusion_matrix(test_y, model_pipe.predict(test_X))
ConfusionMatrixDisplay(cfm).plot()
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1fe4e0ab400>
Given the information at hand, the model correctly identified 154 of the 426 churn cases in the test set, i.e. a recall of roughly 36% for the churn class. We consider this performance satisfactory since the macro F1-score is above 0.7, which is a decent score for such a heavily imbalanced dataset.
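As a quick numeric check of the figures quoted above, the churn-class recall and the macro F1-score can be recomputed from the objects already defined; a minimal sketch reusing cfm and model_pipe from the previous cells:
# Sanity check of the figures above, reusing cfm and model_pipe defined earlier
from sklearn.metrics import f1_score
tn, fp, fn, tp = cfm.ravel()
print(f"Churners correctly identified: {tp} out of {tp + fn} ({tp / (tp + fn):.0%})")
print(f"Macro F1: {f1_score(test_y, model_pipe.predict(test_X), average='macro'):.2f}")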
# Getting the names of the selected features
# The selector object
selector = model_pipe[1]
# The indices of the features
indx = selector.get_support(indices=True)
select_ftr = df_full.columns[indx]
print(select_ftr)
Index(['has_gas', 'origin_up', 'price_change_energy', 'cons_12m', 'forecast_cons_12m', 'forecast_discount_energy', 'forecast_meter_rent_12m', 'imp_cons', 'margin_gross_pow_ele', 'nb_prod_act', 'net_margin', 'pow_max', 'price_off_peak_var', 'price_off_peak_fix', 'previous_price', 'price_sens', 'end_year', 'modif_prod_month', 'renewal_year', 'renewal_month', 'diff_act_end', 'diff_act_modif', 'diff_end_modif', 'ratio_last_month_last12m_cons'], dtype='object')
# The new dataframe with the selected features
final_df = df_full[select_ftr]
final_df
has_gas | origin_up | price_change_energy | cons_12m | forecast_cons_12m | forecast_discount_energy | forecast_meter_rent_12m | imp_cons | margin_gross_pow_ele | nb_prod_act | ... | previous_price | price_sens | end_year | modif_prod_month | renewal_year | renewal_month | diff_act_end | diff_act_modif | diff_end_modif | ratio_last_month_last12m_cons | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | 4660 | 189.95 | 0.0 | 16.27 | 0.00 | 16.38 | 1 | ... | 0.078366 | 13.640956 | 2016 | 8 | 2015 | 8 | 2566 | 0 | 2566 | 0.000000 |
2 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | 544 | 47.96 | 0.0 | 38.72 | 0.00 | 28.60 | 1 | ... | 0.129466 | 45.058910 | 2016 | 4 | 2015 | 4 | 2192 | 0 | 2192 | 0.000000 |
3 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | 1584 | 240.04 | 0.0 | 19.83 | 0.00 | 30.22 | 1 | ... | 0.075605 | 28.408609 | 2016 | 3 | 2015 | 3 | 2192 | 0 | 2192 | 0.000000 |
4 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | 4425 | 445.75 | 0.0 | 131.73 | 52.32 | 44.91 | 1 | ... | 0.113906 | 18.799168 | 2016 | 1 | 2015 | 3 | 2245 | 0 | 2245 | 0.118870 |
5 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 8302 | 796.94 | 0.0 | 30.12 | 181.21 | 33.12 | 1 | ... | 0.128293 | 40.841311 | 2016 | 11 | 2015 | 12 | 1827 | 1423 | 404 | 0.240665 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
14537 | t | lxidpiddsbxsbosboudacockeimpuepw | decrease | 32270 | 4648.01 | 0.0 | 18.57 | 0.00 | 27.88 | 2 | ... | 0.072062 | 21.198938 | 2016 | 5 | 2014 | 5 | 1445 | 1079 | 366 | 0.000000 |
14538 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 7223 | 631.69 | 0.0 | 144.03 | 15.94 | 0.00 | 1 | ... | 0.101103 | 18.186301 | 2016 | 8 | 2015 | 8 | 1461 | 0 | 1461 | 0.025059 |
14539 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 1844 | 190.39 | 0.0 | 129.60 | 18.05 | 39.84 | 1 | ... | 0.114066 | 18.237929 | 2016 | 2 | 2015 | 2 | 1460 | 0 | 1460 | 0.097072 |
14540 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 131 | 19.34 | 0.0 | 7.18 | 0.00 | 13.08 | 1 | ... | 0.078366 | 12.121174 | 2016 | 8 | 2015 | 8 | 1461 | 0 | 1461 | 0.000000 |
14541 | f | ldkssxwpmemidmecebumciepifcamkci | decrease | 8730 | 762.41 | 0.0 | 1.07 | 0.00 | 11.84 | 1 | ... | 0.128003 | 501.481008 | 2016 | 12 | 2015 | 12 | 2556 | 0 | 2556 | 0.000000 |
14340 rows × 24 columns
# Getting the trained model
clf = model_pipe[-1]
clf
RandomForestClassifier(max_depth=14, max_features=None, n_estimators=180)
# Saving the model
filename = 'best_model_PowerCo.sav'
pickle.dump(clf, open(filename, 'wb'))
# Saving the study
pickle.dump(study_1, open('optuna_study_PowerCo.sav', 'wb'))
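Both artifacts can be read back the same way in a later session; a minimal sketch using the file names above:
# Reloading the saved model and study in a later session (illustrative)
loaded_model = pickle.load(open('best_model_PowerCo.sav', 'rb'))
loaded_study = pickle.load(open('optuna_study_PowerCo.sav', 'rb'))
print(loaded_study.best_trial.number, loaded_study.best_value)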
# Getting the names of the features after the second phase of feature selection
# Getting numerical features
num_ftr = [col for col in final_df.columns if final_df[col].dtype in ['int64', 'float64']]
# Getting categorical features
cat_ftr = [col for col in final_df.columns if col not in num_ftr + date_ftr and col != 'id']
print(len(num_ftr)+len(cat_ftr))
24
SHAP (SHapley Additive exPlanations) is a technique for interpreting machine learning models. It explains an individual prediction by computing Shapley values, a concept from cooperative game theory: each value represents the contribution of one explanatory variable to the prediction in question.
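For reference, the Shapley value of a feature $i$ is its average marginal contribution over all subsets $S$ of the remaining features, where $N$ is the set of features and $v(S)$ denotes the expected model output when only the features in $S$ are known:

$$\phi_i = \sum_{S \subseteq N \setminus \{i\}} \frac{|S|!\,(|N|-|S|-1)!}{|N|!}\,\bigl[v(S \cup \{i\}) - v(S)\bigr]$$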
# Getting the full dataframe with the selected features.
df_int = new_df_ft[select_ftr]
# Getting the train_X and test_X with the selected features
new_train_X = train_X[select_ftr]
new_test_X = test_X[select_ftr]
new_train_X
has_gas | origin_up | price_change_energy | cons_12m | forecast_cons_12m | forecast_discount_energy | forecast_meter_rent_12m | imp_cons | margin_gross_pow_ele | nb_prod_act | ... | previous_price | price_sens | end_year | modif_prod_month | renewal_year | renewal_month | diff_act_end | diff_act_modif | diff_end_modif | ratio_last_month_last12m_cons | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7835 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 62623 | 5636.69 | 0.0 | 129.60 | 405.53 | 13.76 | 1 | ... | 0.113567 | 18.190599 | 2016 | 1 | 2015 | 3 | 1461 | 1401 | 60 | 0.073535 |
7058 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | 24673 | 3650.39 | 0.0 | 0.00 | 0.00 | 4.83 | 1 | ... | 0.073848 | 41.060672 | 2016 | 11 | 2015 | 12 | 2557 | 0 | 2557 | 0.000000 |
935 | f | ldkssxwpmemidmecebumciepifcamkci | decrease | 2339 | 227.88 | 0.0 | 131.77 | 21.80 | 22.44 | 1 | ... | 0.112147 | 17.713324 | 2016 | 7 | 2015 | 8 | 2557 | 2163 | 394 | 0.097050 |
11893 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 5973 | 388.97 | 0.0 | 0.00 | 67.84 | 24.62 | 1 | ... | 0.074933 | 24.044035 | 2016 | 11 | 2015 | 12 | 1461 | 1059 | 402 | 0.178302 |
3007 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 2059 | 260.77 | 0.0 | 14.34 | 18.33 | 14.53 | 1 | ... | 0.127125 | -2716.204844 | 2016 | 9 | 2015 | 9 | 1481 | 750 | 731 | 0.073822 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1034 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 1251 | 119.55 | 0.0 | 146.53 | 7.33 | 49.44 | 1 | ... | 0.114066 | 18.394221 | 2016 | 2 | 2015 | 2 | 1461 | 0 | 1461 | 0.062350 |
9554 | t | lxidpiddsbxsbosboudacockeimpuepw | decrease | 44785 | 4509.22 | 0.0 | 130.31 | 328.55 | 1.59 | 2 | ... | 0.112963 | 19.230034 | 2016 | 11 | 2015 | 12 | 2192 | 1437 | 755 | 0.076008 |
5079 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 3819 | 573.00 | 0.0 | 18.72 | 0.00 | 33.12 | 1 | ... | 0.075014 | 17.281933 | 2016 | 12 | 2015 | 2 | 2191 | 2144 | 47 | 0.000000 |
10947 | f | ldkssxwpmemidmecebumciepifcamkci | decrease | 42144 | 3860.98 | 0.0 | 131.37 | 44.32 | 21.16 | 1 | ... | 0.112523 | 18.525477 | 2016 | 6 | 2015 | 6 | 2557 | 0 | 2557 | 0.011698 |
6441 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 1225 | 179.65 | 0.0 | 16.56 | 28.85 | 12.36 | 1 | ... | 0.073711 | 21.852782 | 2016 | 6 | 2015 | 9 | 1826 | 1384 | 442 | 0.164082 |
10038 rows × 24 columns
# Scaling numerical features
scaler = SklearnTransformerWrapper(MinMaxScaler())
df_int = scaler.fit_transform(df_int)
df_int
has_gas | origin_up | price_change_energy | cons_12m | forecast_cons_12m | forecast_discount_energy | forecast_meter_rent_12m | imp_cons | margin_gross_pow_ele | nb_prod_act | ... | previous_price | price_sens | end_year | modif_prod_month | renewal_year | renewal_month | diff_act_end | diff_act_modif | diff_end_modif | ratio_last_month_last12m_cons | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | 0.000751 | 0.002291 | 0.0 | 0.027148 | 0.000000 | 0.043722 | 0.000000 | ... | 0.351456 | 0.050352 | 0.0 | 0.636364 | 0.666667 | 0.636364 | 0.451526 | 0.012884 | 0.537402 | 0.000000 |
2 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | 0.000087 | 0.000579 | 0.0 | 0.064608 | 0.000000 | 0.076340 | 0.000000 | ... | 0.580630 | 0.050598 | 0.0 | 0.272727 | 0.666667 | 0.272727 | 0.359498 | 0.012884 | 0.458148 | 0.000000 |
3 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | 0.000255 | 0.002895 | 0.0 | 0.033088 | 0.000000 | 0.080664 | 0.000000 | ... | 0.339073 | 0.050468 | 0.0 | 0.181818 | 0.666667 | 0.181818 | 0.359498 | 0.012884 | 0.458148 | 0.000000 |
4 | f | kamkkxfxxuwbdslkwifmmcsiusiuosws | decrease | 0.000713 | 0.005377 | 0.0 | 0.219803 | 0.003478 | 0.119875 | 0.000000 | ... | 0.510846 | 0.050393 | 0.0 | 0.000000 | 0.666667 | 0.181818 | 0.372539 | 0.012884 | 0.469379 | 0.093135 |
5 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 0.001337 | 0.009613 | 0.0 | 0.050258 | 0.012046 | 0.088405 | 0.000000 | ... | 0.575366 | 0.050565 | 0.0 | 0.909091 | 0.666667 | 1.000000 | 0.269685 | 0.334539 | 0.079254 | 0.188562 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
14537 | t | lxidpiddsbxsbosboudacockeimpuepw | decrease | 0.005199 | 0.056066 | 0.0 | 0.030986 | 0.000000 | 0.074418 | 0.032258 | ... | 0.323184 | 0.050411 | 0.0 | 0.363636 | 0.333333 | 0.363636 | 0.175689 | 0.256781 | 0.071202 | 0.000000 |
14538 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 0.001164 | 0.007620 | 0.0 | 0.240326 | 0.001060 | 0.000000 | 0.000000 | ... | 0.453424 | 0.050388 | 0.0 | 0.636364 | 0.666667 | 0.636364 | 0.179626 | 0.012884 | 0.303242 | 0.019634 |
14539 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 0.000297 | 0.002297 | 0.0 | 0.216249 | 0.001200 | 0.106342 | 0.000000 | ... | 0.511562 | 0.050388 | 0.0 | 0.090909 | 0.666667 | 0.090909 | 0.179380 | 0.012884 | 0.303030 | 0.076056 |
14540 | f | lxidpiddsbxsbosboudacockeimpuepw | decrease | 0.000021 | 0.000233 | 0.0 | 0.011980 | 0.000000 | 0.034914 | 0.000000 | ... | 0.351456 | 0.050341 | 0.0 | 0.636364 | 0.666667 | 0.636364 | 0.179626 | 0.012884 | 0.303242 | 0.000000 |
14541 | f | ldkssxwpmemidmecebumciepifcamkci | decrease | 0.001406 | 0.009196 | 0.0 | 0.001785 | 0.000000 | 0.031604 | 0.000000 | ... | 0.574066 | 0.054158 | 0.0 | 1.000000 | 0.666667 | 1.000000 | 0.449065 | 0.012884 | 0.535283 | 0.000000 |
14340 rows × 24 columns
# Encoding categorical features
enc = CountFrequencyEncoder()
df_int = enc.fit_transform(df_int)
df_int
has_gas | origin_up | price_change_energy | cons_12m | forecast_cons_12m | forecast_discount_energy | forecast_meter_rent_12m | imp_cons | margin_gross_pow_ele | nb_prod_act | ... | previous_price | price_sens | end_year | modif_prod_month | renewal_year | renewal_month | diff_act_end | diff_act_modif | diff_end_modif | ratio_last_month_last12m_cons | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 11728 | 4272 | 13702 | 0.000751 | 0.002291 | 0.0 | 0.027148 | 0.000000 | 0.043722 | 0.000000 | ... | 0.351456 | 0.050352 | 0.0 | 0.636364 | 0.666667 | 0.636364 | 0.451526 | 0.012884 | 0.537402 | 0.000000 |
2 | 11728 | 4272 | 13702 | 0.000087 | 0.000579 | 0.0 | 0.064608 | 0.000000 | 0.076340 | 0.000000 | ... | 0.580630 | 0.050598 | 0.0 | 0.272727 | 0.666667 | 0.272727 | 0.359498 | 0.012884 | 0.458148 | 0.000000 |
3 | 11728 | 4272 | 13702 | 0.000255 | 0.002895 | 0.0 | 0.033088 | 0.000000 | 0.080664 | 0.000000 | ... | 0.339073 | 0.050468 | 0.0 | 0.181818 | 0.666667 | 0.181818 | 0.359498 | 0.012884 | 0.458148 | 0.000000 |
4 | 11728 | 4272 | 13702 | 0.000713 | 0.005377 | 0.0 | 0.219803 | 0.003478 | 0.119875 | 0.000000 | ... | 0.510846 | 0.050393 | 0.0 | 0.000000 | 0.666667 | 0.181818 | 0.372539 | 0.012884 | 0.469379 | 0.093135 |
5 | 11728 | 7009 | 13702 | 0.001337 | 0.009613 | 0.0 | 0.050258 | 0.012046 | 0.088405 | 0.000000 | ... | 0.575366 | 0.050565 | 0.0 | 0.909091 | 0.666667 | 1.000000 | 0.269685 | 0.334539 | 0.079254 | 0.188562 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
14537 | 2612 | 7009 | 13702 | 0.005199 | 0.056066 | 0.0 | 0.030986 | 0.000000 | 0.074418 | 0.032258 | ... | 0.323184 | 0.050411 | 0.0 | 0.363636 | 0.333333 | 0.363636 | 0.175689 | 0.256781 | 0.071202 | 0.000000 |
14538 | 11728 | 7009 | 13702 | 0.001164 | 0.007620 | 0.0 | 0.240326 | 0.001060 | 0.000000 | 0.000000 | ... | 0.453424 | 0.050388 | 0.0 | 0.636364 | 0.666667 | 0.636364 | 0.179626 | 0.012884 | 0.303242 | 0.019634 |
14539 | 11728 | 7009 | 13702 | 0.000297 | 0.002297 | 0.0 | 0.216249 | 0.001200 | 0.106342 | 0.000000 | ... | 0.511562 | 0.050388 | 0.0 | 0.090909 | 0.666667 | 0.090909 | 0.179380 | 0.012884 | 0.303030 | 0.076056 |
14540 | 11728 | 7009 | 13702 | 0.000021 | 0.000233 | 0.0 | 0.011980 | 0.000000 | 0.034914 | 0.000000 | ... | 0.351456 | 0.050341 | 0.0 | 0.636364 | 0.666667 | 0.636364 | 0.179626 | 0.012884 | 0.303242 | 0.000000 |
14541 | 11728 | 3056 | 13702 | 0.001406 | 0.009196 | 0.0 | 0.001785 | 0.000000 | 0.031604 | 0.000000 | ... | 0.574066 | 0.054158 | 0.0 | 1.000000 | 0.666667 | 1.000000 | 0.449065 | 0.012884 | 0.535283 | 0.000000 |
14340 rows × 24 columns
# Importing the classifier
model = pickle.load(open('best_model_PowerCo.sav', 'rb'))
model
RandomForestClassifier(max_depth=14, max_features=None, n_estimators=180)
# Computing the SHAP values with a TreeExplainer (for this classifier, shap_values is a list with one array of SHAP values per class)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_int)
# Plotting feature importance
shap.summary_plot(shap_values[0], df_int, max_display=26, plot_type='bar')
We can see from the bar plot above that the three most important features for the model are "pow_max" (the subscribed power), "net_margin" and "end_year" (the registered year in which the contract ends).
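The same ranking can be read off numerically as the mean absolute SHAP value per feature; a minimal sketch reusing shap_values and df_int from above:
# Ranking features by mean absolute SHAP value (the quantity shown in the bar plot)
mean_abs_shap = np.abs(shap_values[0]).mean(axis=0)
shap_importance = pd.Series(mean_abs_shap, index=df_int.columns).sort_values(ascending=False)
print(shap_importance.head(10))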
# Investigating the global effect of each feature on the model output (pushing it up or down)
shap.summary_plot(shap_values[0], df_int, max_display=26)
From this visual we can conclude that:
High subscribed power increases a client's likelihood of churning (it impacts the model output negatively, pushing it towards zero). This may be a signal that large clients are not satisfied with the prices they pay given the large quantities they buy, i.e. other companies are offering them better terms. Another signal that the company is suffering an exodus of its big clients is that higher values of "forecast_cons_12m" also push the model towards predicting churn.
Churn is mainly driven by price-related variables: a decrease in the energy price during the previous year, captured by the feature "price_change_energy", lowers a client's chances of leaving the company. (This feature is categorical and was count-encoded; the category "decrease" has by far the highest frequency, so it corresponds to the large values of this feature seen in the visual.)
Clients that also have a gas contract with the company are less likely to churn (under the count encoding of "has_gas", "f" maps to a high value and "t" to a low value).
# The encoding of categorical features that may help us interpret the model.
enc.encoder_dict_
{'has_gas': {'f': 11728, 't': 2612}, 'origin_up': {'lxidpiddsbxsbosboudacockeimpuepw': 7009, 'kamkkxfxxuwbdslkwifmmcsiusiuosws': 4272, 'ldkssxwpmemidmecebumciepifcamkci': 3056, 'Rare': 3}, 'price_change_energy': {'decrease': 13702, 'increase': 625, 'stable': 13}}
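To drill into a single driver such as "pow_max", a dependence plot can relate the (scaled) feature value to its SHAP contribution; a minimal sketch reusing the SHAP values computed above:
# Dependence plot for a single feature (illustrative; any column of df_int can be passed)
shap.dependence_plot('pow_max', shap_values[0], df_int)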