#!/usr/bin/env python
# coding: utf-8

# # Exploring Quantile Regression as Classification
#
# The purpose of this notebook is to explore the possibility of fitting
# conditional quantile estimators using classifiers on binned target values
# with interpolation at predict time.

# In[1]:


from scipy.interpolate import interp1d
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.utils.validation import check_is_fitted
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.utils.validation import check_consistent_length
from sklearn.utils import check_random_state
import numpy as np


class BinnedQuantileRegressor(BaseEstimator, RegressorMixin):
    def __init__(
        self,
        estimator=None,
        n_bins=100,
        quantile=0.5,
        binning_strategy="quantile",
        interpolation_knots="edges",
        interpolation_kind="linear",
        random_state=None,
    ):
        self.n_bins = n_bins
        self.estimator = estimator
        self.quantile = quantile
        self.binning_strategy = binning_strategy
        self.interpolation_knots = interpolation_knots
        self.interpolation_kind = interpolation_kind
        self.random_state = random_state

    def fit(self, X, y):
        assert self.interpolation_knots in ("edges", "midpoints")
        # Lightweight input validation: most of the input validation will be
        # handled by the sub-estimators.
        random_state = check_random_state(self.random_state)
        check_consistent_length(X, y)

        # Bin the continuous target values into ordinal class labels.
        self.target_binner_ = KBinsDiscretizer(
            n_bins=self.n_bins,
            strategy=self.binning_strategy,
            subsample=200_000,
            encode="ordinal",
            random_state=random_state,
        )
        y_binned = self.target_binner_.fit_transform(
            np.asarray(y).reshape(-1, 1)
        ).ravel().astype(np.int32)

        # Fit the multiclass classifier to predict the binned targets from the
        # training set.
        if self.estimator is None:
            estimator = RandomForestClassifier(random_state=random_state)
        else:
            estimator = clone(self.estimator)
        self.estimator_ = estimator.fit(X, y_binned)
        return self

    def predict_quantiles(self, X, quantiles):
        check_is_fitted(self, "estimator_")
        edges = self.target_binner_.bin_edges_[0]
        n_bins = edges.shape[0] - 1
        expected_shape = (X.shape[0], n_bins)
        y_proba_raw = self.estimator_.predict_proba(X)

        # Some bins might stay empty on the training set, in particular with
        # the uniform binning strategy. Typically, classifiers do not learn to
        # predict an explicit 0 probability for unobserved classes so we have
        # to post-process their output:
        if y_proba_raw.shape != expected_shape:
            y_proba = np.zeros(shape=expected_shape)
            y_proba[:, self.estimator_.classes_] = y_proba_raw
        else:
            y_proba = y_proba_raw

        # Build the mapper for inverse CDF mapping, from cumulative
        # probabilities to continuous predictions.
        if self.interpolation_knots == "edges":
            y_cdf = np.zeros(shape=(X.shape[0], edges.shape[0]))
            y_cdf[:, 1:] = np.cumsum(y_proba, axis=1)
            return np.asarray(
                [
                    interp1d(
                        y_cdf_i,
                        edges,
                        kind=self.interpolation_kind,
                    )(quantiles)
                    for y_cdf_i in y_cdf
                ]
            )
        else:
            midpoints = (edges[1:] + edges[:-1]) / 2
            y_cdf = np.cumsum(y_proba, axis=1)
            return np.asarray(
                [
                    interp1d(
                        y_cdf_i,
                        midpoints,
                        kind=self.interpolation_kind,
                        bounds_error=False,
                        fill_value=(midpoints[0], midpoints[-1]),
                    )(quantiles)
                    for y_cdf_i in y_cdf
                ]
            )

    def predict(self, X):
        return self.predict_quantiles(X, self.quantile).ravel()

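
# To make the inverse-CDF interpolation step above concrete, here is a
# minimal standalone sketch. The bin edges and class probabilities below are
# made up for illustration (they do not come from a fitted model); it shows
# how cumulated class probabilities are mapped back to continuous quantile
# values with `interp1d`, following the default "edges" knots convention:

# In[ ]:


toy_edges = np.array([0.0, 1.0, 2.0, 3.0])  # edges of 3 hypothetical bins
toy_proba = np.array([0.2, 0.5, 0.3])  # predicted P(bin) for a single sample
toy_cdf = np.zeros(toy_edges.shape[0])
toy_cdf[1:] = np.cumsum(toy_proba)  # [0.0, 0.2, 0.7, 1.0]
# Read the median and the 95th percentile off the piecewise-linear inverse CDF:
print(interp1d(toy_cdf, toy_edges)([0.5, 0.95]))  # approx. [1.6, 2.83]
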

# Let's define a toy 1D dataset with heteroscedastic noise to compare
# different base classifiers:

# In[2]:


import matplotlib.pyplot as plt

rng = np.random.RandomState(42)
x = rng.uniform(-1, 1, size=10_000)
X = x.reshape(-1, 1)
y = np.sin(x * np.pi) + 1
y /= y.max()

# Add heteroscedastic noise to make conditional quantiles interesting:
y += rng.normal(0, y / 5, size=y.shape)

_ = plt.plot(x, y, "o", alpha=0.1)


# Let's also define some reusable helper functions to compare the models:

# In[3]:


from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_pinball_loss
from functools import partial


def fit_and_plot(model, X, y, display_bin_edges=False):
    _ = plt.plot(X.ravel(), y, "o", alpha=0.1, markeredgewidth=0)
    X_test = np.linspace(X.min(), X.max(), 1000).reshape(-1, 1)
    q_pred = model.fit(X, y).predict(X_test)
    plt.plot(X_test, q_pred, label="q=0.95")
    if display_bin_edges:
        # Plot horizontal lines for each of the bin edges:
        bin_edges = model.target_binner_.bin_edges_[0]
        plt.hlines(
            bin_edges,
            X_test.min(),
            X_test.max(),
            color="k",
            linestyles="-",
            linewidths=0.2,
            alpha=0.2,
        )
    plt.legend()
    plt.show()


def evaluate(model, X, y, quantile=0.95):
    # Score with the pinball loss at the requested quantile level.
    neg_pinball_loss = make_scorer(
        partial(mean_pinball_loss, alpha=quantile),
        greater_is_better=False,
    )
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=5,
        scoring=neg_pinball_loss,
        return_train_score=True,
        n_jobs=-1,
    )
    train_scores = -cv_results["train_score"]
    val_scores = -cv_results["test_score"]
    fit_time = cv_results["fit_time"]
    print(
        f"Train pinball (q={quantile:.2f}): {train_scores.mean():.5f} +/- {train_scores.std():.5f}\n"
        f"Val pinball (q={quantile:.2f}): {val_scores.mean():.5f} +/- {val_scores.std():.5f}\n"
        f"Fit time: {fit_time.mean():.4f} +/- {fit_time.std():.4f} s"
    )

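
# The pinball (quantile) loss used above is asymmetric: under-predictions are
# penalized with weight `alpha` and over-predictions with weight `1 - alpha`,
# so minimizing it targets the `alpha` conditional quantile. A quick sanity
# check (the values below are made up for illustration) against a manual
# implementation of the formula:

# In[ ]:


y_true_demo = np.array([1.0, 2.0, 3.0])
y_pred_demo = np.array([1.5, 1.5, 1.5])
alpha = 0.95
manual_pinball = np.mean(
    alpha * np.maximum(y_true_demo - y_pred_demo, 0)
    + (1 - alpha) * np.maximum(y_pred_demo - y_true_demo, 0)
)
# Both should print the same value (approx. 0.6417):
print(manual_pinball, mean_pinball_loss(y_true_demo, y_pred_demo, alpha=alpha))
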

# ## RandomForestClassifier and the impact of the number of target bins
#
# This classifier can naturally handle a large number of classes (target
# value bins). It is a strong baseline as it is fast to fit and yields
# competitive pinball loss values.
#
# Let's use it to explore the impact of the number of bins.

# In[4]:


base_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=30,
    min_samples_leaf=50,
    n_jobs=-1,
)
binned_quantile_reg = BinnedQuantileRegressor(
    estimator=base_classifier,
    n_bins=10,  # small value!
    quantile=0.95,
    random_state=42,
)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=True)


# Note that the interpolation makes it possible to predict values between
# the edges of the bins. This is not possible with the classifier alone.
# However we can see steps, so the model is probably underfitting because of
# the small number of bins. Let's increase it:

# In[5]:


binned_quantile_reg.set_params(n_bins=100)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=True)


# We can increase it a lot more. Note that the resulting model is slower but
# not overfitting more than the previous one:

# In[6]:


binned_quantile_reg.set_params(n_bins=1000)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=False)


# For the rest of the notebook, we will keep the number of bins at 100:

# In[7]:


n_bins = 100
_ = binned_quantile_reg.set_params(n_bins=n_bins)


# ## MLPClassifier on spline numerical features
#
# Much slower (around 100x) on CPU but slightly better results. This is the
# only method that yields a smooth (and non-oscillating) prediction function:

# In[8]:


from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer

base_classifier = make_pipeline(
    SplineTransformer(n_knots=5, degree=3),
    MLPClassifier(
        random_state=42,
        hidden_layer_sizes=[100, 100],
        solver="adam",
        max_iter=300,
    ),
)
binned_quantile_reg.set_params(estimator=base_classifier)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=True)


# ## XGBClassifier with vector leaves
#
# Using the XGBoost classifier as a base estimator gives good enough results
# but it is typically slower and no better than the random forest with a tuned
# `min_samples_leaf`.
#
# The story might be different on datasets with a very large number of
# samples.
#
# It probably requires more expensive hyperparameter tuning to shine.

# In[9]:


from xgboost import XGBClassifier

base_classifier = XGBClassifier(
    tree_method="hist",
    multi_strategy="multi_output_tree",
    random_state=42,
    max_leaves=10,
    n_estimators=100,
    learning_rate=0.3,
)
binned_quantile_reg.set_params(estimator=base_classifier)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=True)


# ## HistGradientBoostingClassifier
#
# **XXX** There is a problem with this one: it does not work at all and I have
# not found the cause yet, even when tweaking the hyperparameters:

# In[10]:


from sklearn.ensemble import HistGradientBoostingClassifier

base_classifier = HistGradientBoostingClassifier(
    random_state=42,
    max_leaf_nodes=10,
    early_stopping=True,
    n_iter_no_change=3,
    max_iter=1_000,
    learning_rate=0.3,
)
binned_quantile_reg.set_params(estimator=base_classifier)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=True)


# ## Comparing to traditional quantile regressors

# In[11]:


from sklearn.ensemble import HistGradientBoostingRegressor

quantile_regressor = HistGradientBoostingRegressor(
    random_state=42,
    loss="quantile",
    quantile=0.95,
    max_leaf_nodes=10,
    early_stopping=True,
    n_iter_no_change=3,
    max_iter=1_000,
    learning_rate=0.3,
)
evaluate(quantile_regressor, X, y)
fit_and_plot(quantile_regressor, X, y)


# In[12]:


from xgboost import XGBRegressor

quantile_regressor = XGBRegressor(
    random_state=42,
    tree_method="hist",
    objective="reg:quantileerror",
    quantile_alpha=0.95,
    max_leaves=10,
    n_estimators=100,
    learning_rate=0.3,
)
evaluate(quantile_regressor, X, y)
fit_and_plot(quantile_regressor, X, y)
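

# One advantage of the binned-classifier approach is that a single fitted
# model gives access to the whole conditional CDF, so several quantiles can
# be predicted at once via `predict_quantiles`. Here is a minimal sketch
# (reusing the RandomForestClassifier settings from above; the variable names
# below are new to this sketch):

# In[ ]:


multi_quantile_reg = BinnedQuantileRegressor(
    estimator=RandomForestClassifier(
        random_state=42, n_estimators=30, min_samples_leaf=50, n_jobs=-1
    ),
    n_bins=100,
    random_state=42,
).fit(X, y)
X_grid = np.linspace(X.min(), X.max(), 1000).reshape(-1, 1)
q_preds = multi_quantile_reg.predict_quantiles(X_grid, [0.05, 0.5, 0.95])
_ = plt.plot(x, y, "o", alpha=0.1, markeredgewidth=0)
for i, q in enumerate([0.05, 0.5, 0.95]):
    plt.plot(X_grid, q_preds[:, i], label=f"q={q}")
_ = plt.legend()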