Last Update: 08/02/2020
PyCaret Version: 2.0
Author: Alexandre Farias
Email: afarias@tuta.io
Image Source: What Is Churn Rate and Why It Will Mess With Your Growth
Customer Churn is when customers leave a service in a given period of time, which is bad for business.
The objective of this work is to build a machine learning model to predict which customers will leave the service. The dataset used in this notebook is the Telco Customer Churn dataset hosted on Kaggle. Also, an Exploratory Data Analysis is performed to better understand the data.
Another point of this work is the use of Deepnote as the development environment and the PyCaret 2.0 Python module to build the whole experiment pipeline.
# Standard
import pandas as pd
import numpy as np
import os
# Pycaret
from pycaret.classification import *
# Plots
from plotly.offline import iplot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
# Sklearn tools
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *
# Extras
from datetime import date
import warnings
warnings.filterwarnings("ignore")
# Datapath and Setup
# Move up to the project root so the data/ and models/ folders resolve.
os.chdir("..")
PATH = os.getcwd()+os.sep  # project root path, with trailing separator
RANDOM_SEED = 142  # fixed seed so sampling / model runs are reproducible
K_FOLDS = 5  # number of cross-validation folds used throughout
And the helper functions used on this notebook.
# Helper functions for structured data
# Get info about the dataset
def dataset_info(dataset, dataset_name: str):
    """Print a summary of *dataset*: header with name and shape, column
    dtypes, missing-data report, and memory usage in MB.

    Args:
        dataset: pandas DataFrame to inspect.
        dataset_name: label shown in the printed header.
    """
    print(f"Dataset Name: {dataset_name} "
          f"| Number of Samples: {dataset.shape[0]} "
          f"| Number of Columns: {dataset.shape[1]}")
    print(30*"=")
    print("Column Data Type")
    print(dataset.dtypes)
    print(30*"=")
    missing_data = dataset.isnull().sum()
    if sum(missing_data) > 0:
        # Only show the columns that actually have missing values.
        print(missing_data[missing_data.values > 0])
    else:
        print("No Missing Data on this Dataset!")
    print(30*"=")
    # memory_usage() is in bytes; 10e5 == 1e6, so this reports megabytes.
    print("Memory Usage: {} MB".\
        format(np.round(
            dataset.memory_usage(index=True).sum() / 10e5, 3
        )))
# Dataset Sampling
def data_sampling(dataset, frac: float, random_seed: int):
    """Split *dataset* into two disjoint random parts.

    Args:
        dataset: pandas DataFrame to split.
        frac: fraction of rows assigned to the first part.
        random_seed: seed forwarded to DataFrame.sample for reproducibility.

    Returns:
        Tuple (sampled, remainder); both parts get a fresh RangeIndex.
    """
    data_sampled_a = dataset.sample(frac=frac, random_state=random_seed)
    # The remainder is everything not drawn into the first sample.
    data_sampled_b = dataset.drop(data_sampled_a.index).reset_index(drop=True)
    data_sampled_a.reset_index(drop=True, inplace=True)
    return data_sampled_a, data_sampled_b
# Bar Plot
def bar_plot(data, plot_title: str, x_axis: str, y_axis: str):
    """Render a bar chart of a value-counts-like Series with Plotly.

    The first bar (largest count when *data* comes from value_counts)
    is highlighted in orange; the rest are blue.

    Args:
        data: pandas Series; index -> x labels, values -> bar heights.
        plot_title: chart title (HTML tags allowed).
        x_axis, y_axis: axis titles (HTML tags allowed).
    """
    colors = ["#0080ff",] * len(data)
    colors[0] = "#ff8000"  # highlight the first bar
    trace = go.Bar(y=data.values, x=data.index, text=data.values,
                   marker_color=colors)
    layout = go.Layout(autosize=False, height=600,
                       title={"text" : plot_title,
                              "y" : 0.9,
                              "x" : 0.5,
                              "xanchor" : "center",
                              "yanchor" : "top"},
                       xaxis={"title" : x_axis},
                       yaxis={"title" : y_axis},)
    fig = go.Figure(data=trace, layout=layout)
    fig.update_layout(template="simple_white")
    fig.update_traces(textposition="outside",
                      textfont_size=14,
                      marker=dict(line=dict(color="#000000", width=2)))
    fig.update_yaxes(automargin=True)
    iplot(fig)
# Plot Pie Chart
def pie_plot(data, plot_title: str):
    """Render a pie chart of a value-counts-like Series with Plotly.

    Args:
        data: pandas Series; index -> slice labels, values -> slice sizes.
        plot_title: chart title (HTML tags allowed).
    """
    trace = go.Pie(labels=data.index, values=data.values)
    layout = go.Layout(autosize=False,
                       title={"text" : plot_title,
                              "y" : 0.9,
                              "x" : 0.5,
                              "xanchor" : "center",
                              "yanchor" : "top"})
    fig = go.Figure(data=trace, layout=layout)
    fig.update_traces(textfont_size=14,
                      marker=dict(line=dict(color="#000000", width=2)))
    fig.update_yaxes(automargin=True)
    iplot(fig)
# Histogram
def histogram_plot(data, plot_title: str, y_axis: str):
    """Render a histogram of a numeric Series/array with Plotly.

    Args:
        data: values to bin on the x axis.
        plot_title: chart title (HTML tags allowed).
        y_axis: y-axis title (HTML tags allowed).
    """
    trace = go.Histogram(x=data)
    layout = go.Layout(autosize=False,
                       title={"text" : plot_title,
                              "y" : 0.9,
                              "x" : 0.5,
                              "xanchor" : "center",
                              "yanchor" : "top"},
                       yaxis={"title" : y_axis})
    fig = go.Figure(data=trace, layout=layout)
    fig.update_traces(marker=dict(line=dict(color="#000000", width=2)))
    fig.update_layout(template="simple_white")
    fig.update_yaxes(automargin=True)
    iplot(fig)
# Particular case: Histogram subplot (1, 2)
def histogram_subplot(dataset_a, dataset_b, feature_a: str,
                      feature_b: str, title: str, title_a: str, title_b: str):
    """Draw two side-by-side histograms on a 1x2 Plotly subplot grid.

    Args:
        dataset_a, dataset_b: DataFrames providing the plotted columns.
        feature_a: column of *dataset_a* for the left histogram.
        feature_b: column of *dataset_b* for the right histogram.
        title: overall figure title.
        title_a, title_b: individual subplot titles.
    """
    fig = make_subplots(rows=1, cols=2, subplot_titles=(
        title_a,
        title_b
        )
    )
    fig.add_trace(go.Histogram(x=dataset_a[feature_a],
                               showlegend=False),
                  row=1, col=1)
    fig.add_trace(go.Histogram(x=dataset_b[feature_b],
                               showlegend=False),
                  row=1, col=2)
    fig.update_layout(template="simple_white")
    fig.update_layout(autosize=False,
                      title={"text" : title,
                             "y" : 0.9,
                             "x" : 0.5,
                             "xanchor" : "center",
                             "yanchor" : "top"},
                      yaxis={"title" : "<i>Frequency</i>"})
    fig.update_traces(marker=dict(line=dict(color="#000000", width=2)))
    fig.update_yaxes(automargin=True)
    iplot(fig)
# Calculate scores with Test/Unseen labeled data
def test_score_report(data_unseen, predict_unseen):
    """Score the unseen-data predictions against the true Churn labels.

    NOTE: mutates *data_unseen* in place by adding an integer "Label"
    column encoded from "Churn"; conf_mat() relies on that column
    afterwards, so the side effect is intentional.

    Args:
        data_unseen: DataFrame holding the true "Churn" column.
        predict_unseen: PyCaret prediction output with a "Label" column.

    Returns:
        One-row DataFrame with Accuracy, AUC, Recall, Precision, F1 Score.
    """
    le = LabelEncoder()
    # LabelEncoder sorts classes alphabetically: "No" -> 0, "Yes" -> 1.
    data_unseen["Label"] = le.fit_transform(data_unseen.Churn.values)
    data_unseen["Label"] = data_unseen["Label"].astype(int)
    accuracy = accuracy_score(data_unseen["Label"], predict_unseen["Label"])
    roc_auc = roc_auc_score(data_unseen["Label"], predict_unseen["Label"])
    precision = precision_score(data_unseen["Label"], predict_unseen["Label"])
    recall = recall_score(data_unseen["Label"], predict_unseen["Label"])
    f1 = f1_score(data_unseen["Label"], predict_unseen["Label"])
    df_unseen = pd.DataFrame({
        "Accuracy" : [accuracy],
        "AUC" : [roc_auc],
        "Recall" : [recall],
        "Precision" : [precision],
        "F1 Score" : [f1]
    })
    return df_unseen
# Confusion Matrix
def conf_mat(data_unseen, predict_unseen):
    """Plot a heatmap of the confusion matrix for the unseen data.

    Expects test_score_report() to have added the integer "Label"
    column to *data_unseen* beforehand.

    Args:
        data_unseen: DataFrame with the true integer "Label" column.
        predict_unseen: PyCaret prediction output with a "Label" column.
    """
    unique_label = data_unseen["Label"].unique()
    cmtx = pd.DataFrame(
        confusion_matrix(data_unseen["Label"],
                         predict_unseen["Label"],
                         labels=unique_label),
        index=['{:}'.format(x) for x in unique_label],
        columns=['{:}'.format(x) for x in unique_label]
    )
    ax = sns.heatmap(cmtx, annot=True, fmt="d", cmap="YlGnBu")
    # BUG FIX: confusion_matrix(y_true, y_pred) puts TRUE labels on rows
    # and PREDICTIONS on columns, so the heatmap's y axis is the target
    # and the x axis is the prediction — the original labels were swapped.
    ax.set_ylabel('Target')
    ax.set_xlabel('Predicted');
    ax.set_title("Predict Unseen Confusion Matrix", size=14);
The dataset is loaded as a Pandas dataframe and a glimpse of the data is shown. A good thing about Deepnote is that the displayed dataframes show the column types, helping to understand the features.
# Load the Telco Customer Churn CSV from the project's data/ folder.
dataset = pd.read_csv(PATH+"data"+os.sep+"customers.csv")
dataset.head(3)
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 rows × 21 columns
Check for duplicated samples.
dataset[dataset.duplicated()]
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn |
---|
0 rows × 21 columns
There are no duplicated samples on the dataset.
More information about the dataset is needed as the number of samples, memory size allocation, etc.
The result is shown in the following output (the data type is shown just for convenience, to make this notebook useful in other environments).
dataset_info(dataset, "customers")
Dataset Name: customers | Number of Samples: 7043 | Number of Columns: 21 ============================== Column Data Type customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges object Churn object dtype: object ============================== No Missing Data on this Dataset! ============================== Memory Usage: 1.183 MB
The dataset has a small memory size allocation (1.183 MB) and is composed of many categorical (object) features and only a few numeric ones, but one of the categorical features doesn't look right: TotalCharges — as shown in the displayed dataframe, the feature is numeric.
TotalCharges
is converted from Object to float64, the same of MonthlyCharges
feature.
# TotalCharges arrives as object dtype; coerce it to float64
# (values that cannot be parsed become NaN).
dataset["TotalCharges"] = pd.to_numeric(dataset["TotalCharges"], errors="coerce")
print(f"The Feature TotalCharges is type {dataset.TotalCharges.dtype} now!")
The Feature TotalCharges is type float64 now!
The Client Churn Distribution is checked for any imbalance, as the feature is the target, it's important to choose what strategy to adopt when dealing with imbalanced classes.
Below, a Pie Chart shows the feature distribution.
pie_plot(dataset["Churn"].value_counts(),
plot_title="<b>Client Churn Distribution<b>")
There's some imbalance in the Churn distribution — 26.5% of the clients have churned — and few occurrences of a label could lead to a bad predictor.
It's possible to choose some ways to work with this case:
The Churn problem is about client retention, is worth to check about false positives, so precision and recall metrics are a must for this situtation.
F1 Score is used to check the quality of the model predictions, as the metric is an harmonic mean of precision and recall.
The contract type is a good feature to analyze what happens to a client churn from that service, a plot from the contract types of not churned clients is showed below.
df_aux = dataset.query('Churn == "No"')
df_aux = df_aux["Contract"].value_counts()
bar_plot(df_aux, "<b>Contract Types of not Churned Clients</b>",
"<i>Contract</i>", "<i>Count</i>")
It is shown that the Month-to-month contract comes first when compared to annual contracts, but the difference between the numbers of contracts is not so big.
To a better comparation, the same plot is showed for the churned clients.
df_aux = dataset.query('Churn == "Yes"')
df_aux = df_aux["Contract"].value_counts()
bar_plot(df_aux, "<b>Contract Types of Churned Clients</b>",
"<i>Contract</i>", "<i>Count</i>")
Now, the difference between Month-to-month and annual contracts is bigger, which can lead to the conclusion that annual contracts are better at retaining clients; perhaps fidelity promotions could help reduce the churn rate.
As the problem can be examined more deep on Month-to-month contract types, a good idea is see the Monthly Charges and Total Charges distribution for the not churned clients of this contract.
df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "No")')
histogram_subplot(df_aux, df_aux, "MonthlyCharges", "TotalCharges",
"<b>Charges Distribution for Month-to-month contracts for not Churned Clients</b>",
"(a) Monthtly Charges Distribution", "(b) Total Charges Distribution")
From the plots, can be said that many clients just got charged with a few values, principally for the Total Charges.
On the following plots, the same features are analyzed, but for churned clients.
df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "Yes")')
histogram_subplot(df_aux, df_aux, "MonthlyCharges", "TotalCharges",
"<b>Charges Distribution for Month-to-month contracts for Churned Clients</b>",
"(a) Monthtly Charges Distribution", "(b) Total Charges Distribution")
Total Charges had the same behaviour, but the Monthly Charges for many churned clients was high, maybe the amount of chage value could lead the client to leave the service.
Still on the Month-to-month contract, it's time to analyze the most used Payment methods of churned clients.
# BUG FIX: the original passed ('Contract == Month-to-month') and ('Churn == "Yes"')
# to query(); Python's `and` on two non-empty strings returns the SECOND string,
# so only the Churn filter was applied (and the first string was missing quotes
# around Month-to-month anyway). Use one query string, as done for the
# charge-distribution plots above.
df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "Yes")')
df_aux = df_aux["PaymentMethod"].value_counts()
bar_plot(df_aux, "<b>Payment Method of Month-to-month contract Churned Clients</b>",
         "<i>Payment Method</i>", "<i>Count</i>")
Many Churned Clients used to pay with electronic checks, automatic payments, as bank transfers or credit card have a few churned clients. A good idea could make promotions to clients that use automatic payment methods.
Lastly, the tenure of the churned clients.
# BUG FIX: same defect as the payment-method cell — `and` between two strings
# returns the second one, so the Contract condition was silently dropped.
# A single query string applies both filters.
df_aux = dataset.query('(Contract == "Month-to-month") and (Churn == "Yes")')
df_aux = df_aux["tenure"].value_counts().head(5)
bar_plot(df_aux, "<b>Tenure of Month-to-month contract for Churned Clients</b>",
         "<i>Tenure</i>", "<i>Count</i>")
Most clients used the service for just one month; it seems the clients used the service to check its quality, or they couldn't afford the charges, as the Monthly Charges for these clients were high and the Total Charges were small because the clients stayed only a short time.
Before setting up PyCaret, a random sample of 10% size of the dataset will be get to make predictions with unseen data.
# Hold out 10% of the rows as unseen data for the final prediction check;
# the remaining 90% goes to the PyCaret experiment.
data, data_unseen = data_sampling(dataset, 0.9, RANDOM_SEED)
print(f"There are {data_unseen.shape[0]} samples for Unseen Data.")
There are 704 samples for Unseen Data.
The PyCaret's setup is made with 90% of data samples and just use one function (setup
) from the module.
It's possible to configure it with various options, such as data pre-processing, feature engineering, etc. The ease and efficiency of PyCaret save a lot of time when prototyping models.
Each setup is an experiment and for this problem, is used the following options:
MonthlyCharges
and TotalCharges
# PyCaret experiment setup (the stray leading "." from the notebook export
# is removed so the assignment is valid Python).
exp01 = setup(data=data, target="Churn", session_id=RANDOM_SEED, ignore_features=["customerID"],
              numeric_features=["SeniorCitizen"], normalize=True,
              feature_selection=True, remove_outliers=True,
              remove_multicollinearity=True, fix_imbalance=True,
              transformation=True, ignore_low_variance=True, pca=True,
              bin_numeric_features=["MonthlyCharges", "TotalCharges"],
              silent=True, experiment_name="customer-churn-prediction",
              log_experiment=True)
Setup Succesfully Completed!
Description | Value | |
---|---|---|
0 | session_id | 142 |
1 | Target Type | Binary |
2 | Label Encoded | No: 0, Yes: 1 |
3 | Original Data | (6339, 21) |
4 | Missing Values | True |
5 | Numeric Features | 4 |
6 | Categorical Features | 16 |
7 | Ordinal Features | False |
8 | High Cardinality Features | False |
9 | High Cardinality Method | None |
10 | Sampled Data | (6022, 21) |
11 | Transformed Train Set | (4215, 40) |
12 | Transformed Test Set | (1807, 40) |
13 | Numeric Imputer | mean |
14 | Categorical Imputer | constant |
15 | Normalize | True |
16 | Normalize Method | zscore |
17 | Transformation | True |
18 | Transformation Method | yeo-johnson |
19 | PCA | True |
20 | PCA Method | linear |
21 | PCA Components | 0.990000 |
22 | Ignore Low Variance | True |
23 | Combine Rare Levels | False |
24 | Rare Level Threshold | None |
25 | Numeric Binning | True |
26 | Remove Outliers | True |
27 | Outliers Threshold | 0.050000 |
28 | Remove Multicollinearity | True |
29 | Multicollinearity Threshold | 0.900000 |
30 | Clustering | False |
31 | Clustering Iteration | None |
32 | Polynomial Features | False |
33 | Polynomial Degree | None |
34 | Trignometry Features | False |
35 | Polynomial Threshold | None |
36 | Group Features | False |
37 | Feature Selection | True |
38 | Features Selection Threshold | 0.800000 |
39 | Feature Interaction | False |
40 | Feature Ratio | False |
41 | Interaction Threshold | None |
42 | Fix Imbalance | True |
43 | Fix Imbalance Method | SMOTE |
PyCaret shows at first if all features types are with it correspondent type, if everything is right, press enter on the blank bar and the setup is finished showing a summary of the experiment.
A great tool on PyCaret is build many models and compare a metric for the bests!
The models are sorted by F1 Score because Precision and Recall are important for the evaluation.
The cross-validation is made with 5-folds.
# Train and cross-validate the candidate models, ranked by F1 Score;
# keep only the single best one. gbc/catboost are blacklisted.
top_model = compare_models(fold=K_FOLDS,
sort="F1",
n_select=1,
blacklist=["gbc", "catboost"])
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
0 | Logistic Regression | 0.7516 | 0.8373 | 0.7777 | 0.5349 | 0.6336 | 0.4553 | 0.4739 | 0.0468 |
1 | Ridge Classifier | 0.7416 | 0.0000 | 0.7897 | 0.5216 | 0.6279 | 0.4424 | 0.4653 | 0.0136 |
2 | Linear Discriminant Analysis | 0.7414 | 0.8358 | 0.7897 | 0.5213 | 0.6277 | 0.4421 | 0.4650 | 0.0485 |
3 | SVM - Linear Kernel | 0.7419 | 0.0000 | 0.7356 | 0.5253 | 0.6115 | 0.4267 | 0.4417 | 0.0532 |
4 | Ada Boost Classifier | 0.7464 | 0.8042 | 0.7004 | 0.5313 | 0.6040 | 0.4226 | 0.4316 | 1.3652 |
5 | Quadratic Discriminant Analysis | 0.7414 | 0.8139 | 0.7082 | 0.5235 | 0.6015 | 0.4164 | 0.4274 | 0.0310 |
6 | Light Gradient Boosting Machine | 0.7692 | 0.8094 | 0.6266 | 0.5772 | 0.6000 | 0.4384 | 0.4398 | 0.9825 |
7 | K Neighbors Classifier | 0.6837 | 0.7636 | 0.7614 | 0.4568 | 0.5709 | 0.3445 | 0.3728 | 0.0920 |
8 | Extreme Gradient Boosting | 0.7599 | 0.7934 | 0.5760 | 0.5646 | 0.5696 | 0.4033 | 0.4038 | 2.7836 |
9 | Extra Trees Classifier | 0.7656 | 0.7878 | 0.5382 | 0.5830 | 0.5593 | 0.4001 | 0.4009 | 0.5752 |
10 | Random Forest Classifier | 0.7599 | 0.7806 | 0.5391 | 0.5701 | 0.5541 | 0.3900 | 0.3903 | 0.2290 |
11 | Naive Bayes | 0.7070 | 0.7421 | 0.6506 | 0.4779 | 0.5510 | 0.3410 | 0.3500 | 0.0137 |
12 | Decision Tree Classifier | 0.6977 | 0.6559 | 0.5459 | 0.4605 | 0.4994 | 0.2852 | 0.2875 | 0.4045 |
The best model suggested by PyCaret is the Logistic Regression, with an F1 Score around 0.63 and a good Recall, around 0.77.
Time to tune the model. The choose_better
argument get the best model between the tuned and best model.
tuned_model = tune_model(estimator=top_model, fold=K_FOLDS,
optimize="F1", choose_better=True,
verbose=False)
Let's see the hyperparameters of the chosen model
plot_model(tuned_model, plot="parameter")
Parameters | |
---|---|
C | 8.261 |
class_weight | balanced |
dual | False |
fit_intercept | True |
intercept_scaling | 1 |
l1_ratio | None |
max_iter | 100 |
multi_class | auto |
n_jobs | -1 |
penalty | l2 |
random_state | 142 |
solver | lbfgs |
tol | 0.0001 |
verbose | 0 |
warm_start | False |
PyCaret also has functions to make ensembles, for this implementation, a bagged model is build.
bagged_model = ensemble_model(tuned_model, fold=K_FOLDS)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.7711 | 0.8668 | 0.8455 | 0.5565 | 0.6712 | 0.5068 | 0.5329 |
1 | 0.7248 | 0.8332 | 0.7682 | 0.5014 | 0.6068 | 0.4092 | 0.4312 |
2 | 0.7450 | 0.8274 | 0.7253 | 0.5281 | 0.6112 | 0.4284 | 0.4403 |
3 | 0.7473 | 0.8326 | 0.7468 | 0.5305 | 0.6203 | 0.4390 | 0.4534 |
4 | 0.7461 | 0.8265 | 0.7768 | 0.5277 | 0.6285 | 0.4462 | 0.4654 |
Mean | 0.7469 | 0.8373 | 0.7725 | 0.5288 | 0.6276 | 0.4459 | 0.4647 |
SD | 0.0147 | 0.0150 | 0.0406 | 0.0174 | 0.0231 | 0.0329 | 0.0360 |
And now a boosted model.
boosted_model = ensemble_model(tuned_model, fold=K_FOLDS,
method="Boosting")
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.7746 | 0.8697 | 0.8541 | 0.5606 | 0.6769 | 0.5150 | 0.5420 |
1 | 0.7414 | 0.8365 | 0.7897 | 0.5212 | 0.6280 | 0.4423 | 0.4647 |
2 | 0.7497 | 0.8301 | 0.7253 | 0.5348 | 0.6157 | 0.4363 | 0.4474 |
3 | 0.7544 | 0.8319 | 0.7554 | 0.5399 | 0.6297 | 0.4535 | 0.4678 |
4 | 0.7509 | 0.8261 | 0.7854 | 0.5335 | 0.6354 | 0.4565 | 0.4762 |
Mean | 0.7542 | 0.8389 | 0.7820 | 0.5380 | 0.6371 | 0.4607 | 0.4796 |
SD | 0.0111 | 0.0158 | 0.0429 | 0.0128 | 0.0209 | 0.0281 | 0.0325 |
The boosted model improved the F1 Score a bit. It's also possible to make blended and stacked models with PyCaret; both are created using the tuned and boosted models.
blended_model = blend_models(estimator_list=[tuned_model, boosted_model],
fold=K_FOLDS)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
0 | 0.7770 | 0.0000 | 0.8412 | 0.5648 | 0.6759 | 0.5157 | 0.5395 |
1 | 0.7414 | 0.0000 | 0.7682 | 0.5219 | 0.6215 | 0.4358 | 0.4546 |
2 | 0.7509 | 0.0000 | 0.7124 | 0.5372 | 0.6125 | 0.4342 | 0.4436 |
3 | 0.7556 | 0.0000 | 0.7468 | 0.5421 | 0.6282 | 0.4529 | 0.4658 |
4 | 0.7580 | 0.0000 | 0.7811 | 0.5433 | 0.6408 | 0.4671 | 0.4846 |
Mean | 0.7566 | 0.0000 | 0.7700 | 0.5419 | 0.6358 | 0.4612 | 0.4776 |
SD | 0.0117 | 0.0000 | 0.0425 | 0.0138 | 0.0221 | 0.0298 | 0.0338 |
The best model still is the boosted model.
Let's plot some metric curves, matrices and see what is the model classifier, starting with the hyperparameters and the used model classifier.
best_model = boosted_model
plot_model(best_model, plot="parameter")
print(f"Model: {type(best_model)}")
Parameters | |
---|---|
algorithm | SAMME.R |
base_estimator__C | 8.261 |
base_estimator__class_weight | balanced |
base_estimator__dual | False |
base_estimator__fit_intercept | True |
base_estimator__intercept_scaling | 1 |
base_estimator__l1_ratio | None |
base_estimator__max_iter | 100 |
base_estimator__multi_class | auto |
base_estimator__n_jobs | -1 |
base_estimator__penalty | l2 |
base_estimator__random_state | 142 |
base_estimator__solver | lbfgs |
base_estimator__tol | 0.0001 |
base_estimator__verbose | 0 |
base_estimator__warm_start | False |
base_estimator | LogisticRegression(C=8.261000000000001, class_... |
learning_rate | 1 |
n_estimators | 10 |
random_state | 142 |
Model: <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>
Let's plot the ROC curve, PR Curve, Confusion Matrix and Metrics for each class.
plot_model(best_model, plot="auc")
The AUC for each class was good: 0.85.
plot_model(best_model, plot="pr")
The PR curve got an average precision around 0.7, which is good.
plot_model(best_model, plot="confusion_matrix")
The Confusion Matrix shows that the churned clients have been classified as not churn in 50% of the predictions
plot_model(best_model, plot="class_report")
The model has done a good work on the metrics for the class 0 (Not Churned) but got a Precision close to 0.5 for class 1 (Churned).
The test is made with the remaining 30% of data that PyCaret got on the setup, it's important to see that the model is not overfitting.
predict_model(best_model);
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Ada Boost Classifier | 0.7565 | 0.8537 | 0.772 | 0.5421 | 0.637 | 0.4621 | 0.4784 |
As everything is right with the model, it's time to finalize it fitting all the data.
final_model = finalize_model(best_model)
The remaining 10% data is used to make predictions with unseen samples, what could include some outliers, it's how real world data works.
Just Kappa Score is not showed, as the focus is the F1 Score, as Precision and Recall are importants to get False Positives and False Negatives.
It's not necessary to make any transformation on the data, PyCaret do this.
# Predict on the 10% unseen split; PyCaret applies the pipeline
# transformations automatically, then report the scores and the
# confusion matrix (conf_mat uses the "Label" column added by
# test_score_report).
predict_unseen = predict_model(final_model, data=data_unseen);
score_unseen = test_score_report(data_unseen, predict_unseen)
print(score_unseen.to_string(index=False))
conf_mat(data_unseen, predict_unseen)
Accuracy AUC Recall Precision F1 Score 0.767045 0.789801 0.833333 0.507246 0.630631
And the predictions on unseen data match the trained model's performance! The model was successfully built!
PyCaret allows to save all the pipeline experiment and the model to deploy.
It's recommended to save with date of the experiments.
save_model(final_model, PATH+"models"+os.sep+"modelCCP_"+date.today().strftime("%m-%d-%Y"))
Transformation Pipeline and Model Succesfully Saved
From the results and explanations presented here, some conclusion can be draw:
From the tools and enviroment used: