import sys
import warnings
# This notebook won't work on Python 2.x
if sys.version_info < (3, 0):
warnings.warn("Notebook not executed - please use Python 3.x to run")
exit(0)
In many classification use cases we are interested not only in predicting class labels but also in outputting probabilities that can be interpreted as confidence levels. In this notebook we will demonstrate how Isotonic Regression can be used to calibrate a GBM classifier.
We will show what this calibration method looks like in scikit-learn using CalibratedClassifierCV, and how the same can be accomplished in H2O.
Please refer to https://scikit-learn.org/stable/modules/calibration.html for the theoretical background of calibrating probabilities.
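To make the goal concrete: a reliability curve compares predicted probabilities with observed event frequencies, and a well-calibrated model lies close to the diagonal. Below is a minimal, self-contained sketch (not part of the original notebook; names like X_demo are illustrative) of how such a curve can be computed with scikit-learn:
# sketch: reliability curve of an uncalibrated GBM on synthetic data
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import calibration_curve
X_demo, y_demo = make_classification(n_samples=2000, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
clf = GradientBoostingClassifier(random_state=0).fit(X_tr, y_tr)
# bucket the predicted P(y=1) and compare the mean prediction in each bucket
# with the observed fraction of positives; gaps indicate miscalibration
frac_pos, mean_pred = calibration_curve(y_te, clf.predict_proba(X_te)[:, 1], n_bins=10)
for mp, fp in zip(mean_pred, frac_pos):
    print(f"predicted {mp:.2f} -> observed {fp:.2f}")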
import h2o
h2o.init(strict_version_check=False)
Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_342"; OpenJDK Runtime Environment (build 1.8.0_342-8u342-b07-0ubuntu1~22.04-b07); OpenJDK 64-Bit Server VM (build 25.342-b07, mixed mode)
  Starting server from /home/kurkami/git/h2o/h2o-3/build/h2o.jar
  Ice root: /tmp/tmp_k1gozye
  JVM stdout: /tmp/tmp_k1gozye/h2o_kurkami_started_from_python.out
  JVM stderr: /tmp/tmp_k1gozye/h2o_kurkami_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
H2O_cluster_uptime:         01 secs
H2O_cluster_timezone:       America/New_York
H2O_data_parsing_timezone:  UTC
H2O_cluster_version:        3.37.0.99999
H2O_cluster_version_age:    1 hour and 4 minutes
H2O_cluster_name:           H2O_from_python_kurkami_9vbn5o
H2O_cluster_total_nodes:    1
H2O_cluster_free_memory:    3.409 Gb
H2O_cluster_total_cores:    12
H2O_cluster_allowed_cores:  12
H2O_cluster_status:         locked, healthy
H2O_connection_url:         http://127.0.0.1:54321
H2O_connection_proxy:       {"http": null, "https": null}
H2O_internal_security:      False
Python_version:             3.10.4 final
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=5000, n_features=2, n_redundant=0, random_state=42)
X_df = h2o.H2OFrame(X, column_names=["x1", "x2"])
y_df = h2o.H2OFrame(y, column_names=["y"]).asfactor()
df = X_df.cbind(y_df)
df
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
x1 | x2 | y |
---|---|---|
0.78399 | 0.399454 | 0 |
0.401748 | -0.23744 | 0 |
-1.72528 | -1.79556 | 0 |
1.34722 | 1.05784 | 1 |
-3.55901 | -3.23764 | 0 |
0.575518 | 0.424405 | 1 |
-0.580976 | 0.639303 | 0 |
1.30574 | -1.27541 | 1 |
-0.770629 | -1.00661 | 0 |
-2.65608 | -2.49828 | 0 |
[5000 rows x 3 columns]
The simplest way of calibrating a classifier is to set aside a subset of the training set and use it for model calibration. In the code below we split the dataset into a training set and a calibration set.
# split data
from sklearn.model_selection import train_test_split
X_train, X_calib, y_train, y_calib = train_test_split(X, y, random_state=42)
# train a calibrated classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
base_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
max_depth=1, random_state=0).fit(X_train, y_train)
calibrated_clf = CalibratedClassifierCV(base_estimator=base_clf, cv="prefit", method="isotonic")
calibrated_clf.fit(X_calib, y_calib)
CalibratedClassifierCV(base_estimator=GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=0), cv='prefit', method='isotonic')
# predict calibrated probabilities
calibrated_clf.predict_proba(X_calib)
array([[1.        , 0.        ],
       [0.09793814, 0.90206186],
       [0.72972973, 0.27027027],
       ...,
       [0.25641026, 0.74358974],
       [0.95652174, 0.04347826],
       [0.02836879, 0.97163121]])
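To see what the isotonic step actually changed, the raw and calibrated probabilities can be put side by side; a short sketch reusing the variables defined above (added here for illustration):
# sketch: contrast raw GBM probabilities with the isotonic-calibrated ones
import numpy as np
raw_p1 = base_clf.predict_proba(X_calib)[:5, 1]        # uncalibrated P(y=1)
cal_p1 = calibrated_clf.predict_proba(X_calib)[:5, 1]  # calibrated P(y=1)
print(np.column_stack([raw_p1, cal_p1]))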
# split data
df_train, df_calib = df.split_frame(ratios=[.8], destination_frames=["df_train", "df_calib"], seed=42)
# train a calibrated classifier
from h2o.estimators.gbm import H2OGradientBoostingEstimator
model = H2OGradientBoostingEstimator(
calibrate_model=True, calibration_frame=df_calib, calibration_method="IsotonicRegression"
)
model.train(
y="y", training_frame=df_train
)
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
H2OGradientBoostingEstimator : Gradient Boosting Machine
Model Key: GBM_model_python_1663084324496_1
number_of_trees | number_of_internal_trees | model_size_in_bytes | min_depth | max_depth | mean_depth | min_leaves | max_leaves | mean_leaves |
---|---|---|---|---|---|---|---|---|
50.0 | 50.0 | 19176.0 | 5.0 | 5.0 | 5.0 | 18.0 | 32.0 | 25.82 |
# calibrated probabilities (cal_p0, cal_p1) are predicted alongside the original probabilities (p0, p1)
model.predict(df_calib)
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
predict | p0 | p1 | cal_p0 | cal_p1 |
---|---|---|---|---|
0 | 0.990326 | 0.00967415 | 1 | 0 |
0 | 0.964438 | 0.0355615 | 0.987288 | 0.0127119 |
0 | 0.975859 | 0.0241413 | 0.987288 | 0.0127119 |
0 | 0.96125 | 0.0387503 | 0.987288 | 0.0127119 |
0 | 0.990921 | 0.00907917 | 1 | 0 |
0 | 0.987689 | 0.0123109 | 0.987288 | 0.0127119 |
1 | 0.0255159 | 0.974484 | 0.0277778 | 0.972222 |
1 | 0.0317869 | 0.968213 | 0.0277778 | 0.972222 |
0 | 0.976713 | 0.0232871 | 0.987288 | 0.0127119 |
0 | 0.990921 | 0.00907917 | 1 | 0 |
[985 rows x 5 columns]
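As a quick sanity check (not part of the original workflow), the Brier score of the raw and calibrated probabilities can be compared by pulling the predictions into pandas; since the calibrator was fitted on this very frame, the comparison is only indicative:
# sketch: a lower Brier score indicates better-calibrated probabilities
from sklearn.metrics import brier_score_loss
preds = model.predict(df_calib).as_data_frame()
actual = df_calib["y"].as_data_frame()["y"].astype(int)
print("raw       :", brier_score_loss(actual, preds["p1"]))
print("calibrated:", brier_score_loss(actual, preds["cal_p1"]))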
In this method we use the full training set and cross-validation to obtain unbiased predictions, and then train an Isotonic Regression model on the CV holdout predictions. In H2O this is done by first training (and possibly tuning) the base classifier, then training the Isotonic Regression model and injecting it into the original classifier.
# train a calibrated classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
base_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
max_depth=1, random_state=0)
calibrated_clf = CalibratedClassifierCV(base_estimator=base_clf, ensemble=False, cv=5, method="isotonic")
calibrated_clf.fit(X, y)
CalibratedClassifierCV(base_estimator=GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=0), cv=5, ensemble=False, method='isotonic')
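# with ensemble=False, the unbiased CV predictions are pooled and a single calibrator is fitted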
len(calibrated_clf.calibrated_classifiers_)
1
# predict calibrated probabilities
calibrated_clf.predict_proba(X)
array([[0.25446429, 0.74553571],
       [0.68292683, 0.31707317],
       [0.95238095, 0.04761905],
       ...,
       [0.91246871, 0.08753129],
       [0.02298851, 0.97701149],
       [0.95238095, 0.04761905]])
# train a classifier using 5-fold CV, make sure you keep the CV holdout predictions
from h2o.estimators.gbm import H2OGradientBoostingEstimator
model = H2OGradientBoostingEstimator(
nfolds=5, keep_cross_validation_predictions=True
)
model.train(
y="y", training_frame=df
)
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
H2OGradientBoostingEstimator : Gradient Boosting Machine
Model Key: GBM_model_python_1663084324496_53
number_of_trees | number_of_internal_trees | model_size_in_bytes | min_depth | max_depth | mean_depth | min_leaves | max_leaves | mean_leaves |
---|---|---|---|---|---|---|---|---|
50.0 | 50.0 | 19611.0 | 5.0 | 5.0 | 5.0 | 16.0 | 32.0 | 26.52 |
# CV holdout predictions will serve as the training frame for the Isotonic Regression calibrator
xval_calib = model.cross_validation_holdout_predictions().cbind(df[["y"]])
xval_calib
predict | p0 | p1 | y |
---|---|---|---|
0 | 0.51785 | 0.48215 | 0 |
0 | 0.896655 | 0.103345 | 0 |
0 | 0.988225 | 0.0117751 | 0 |
1 | 0.0468475 | 0.953152 | 1 |
0 | 0.988428 | 0.0115717 | 0 |
1 | 0.378415 | 0.621585 | 1 |
0 | 0.987002 | 0.0129979 | 0 |
1 | 0.0143761 | 0.985624 | 1 |
0 | 0.969988 | 0.0300118 | 0 |
0 | 0.988784 | 0.0112155 | 0 |
[5000 rows x 4 columns]
# train an Isotonic Regression model with the actual labels as the target and the holdout p1 predictions as a (single) feature
from h2o.estimators.isotonicregression import H2OIsotonicRegressionEstimator
h2o_calibrator = H2OIsotonicRegressionEstimator()
h2o_calibrator.train(training_frame=xval_calib, x="p1", y="y")
isotonicregression Model Build progress: |███████████████████████████████████████| (done) 100%
H2OIsotonicRegressionEstimator : Isotonic Regression
Model Key: IsotonicRegression_model_python_1663084324496_621
number_of_observations | number_of_thresholds |
---|---|
5000.0 | 60.0 |
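Since the calibrator is a regular H2O model, it can also be applied standalone before injection; an illustrative extra step (not required by the workflow):
# sketch: the standalone calibrator maps raw p1 values through the fitted
# piecewise-constant (isotonic) step function
h2o_calibrator.predict(xval_calib)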
# inject the calibrator model into the original GBM model
model.calibrate(h2o_calibrator)
# predict with calibrated probabilities
model.predict(df)
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
predict | p0 | p1 | cal_p0 | cal_p1 |
---|---|---|---|---|
0 | 0.569895 | 0.430105 | 0.609756 | 0.390244 |
0 | 0.908601 | 0.0913987 | 0.900958 | 0.0990415 |
0 | 0.987997 | 0.012003 | 0.997354 | 0.0026455 |
1 | 0.0576622 | 0.942338 | 0.06875 | 0.93125 |
0 | 0.988754 | 0.0112455 | 0.997354 | 0.0026455 |
1 | 0.374216 | 0.625784 | 0.352941 | 0.647059 |
0 | 0.980122 | 0.0198777 | 0.986861 | 0.0131387 |
1 | 0.0130121 | 0.986988 | 0.00689655 | 0.993103 |
0 | 0.949659 | 0.0503407 | 0.942308 | 0.0576923 |
0 | 0.988754 | 0.0112455 | 0.997354 | 0.0026455 |
[5000 rows x 5 columns]
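As with the holdout approach, the CV-based calibration can be sanity-checked; a sketch comparing the log loss of raw vs calibrated probabilities (scikit-learn clips the exact 0/1 values isotonic regression can produce, and the comparison runs on training data, so it is only indicative):
# sketch: compare log loss of raw vs calibrated probabilities
from sklearn.metrics import log_loss
preds = model.predict(df).as_data_frame()
actual = df["y"].as_data_frame()["y"].astype(int)
print("raw       :", log_loss(actual, preds["p1"]))
print("calibrated:", log_loss(actual, preds["cal_p1"]))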