#!/usr/bin/env python
# coding: utf-8
# Michel Kiffel
# mkiffel@gmail.com
# This notebook supports [this blog article](https://mkffl.github.io/2022/03/02/Decisions-Part-3.html)
# ## Objective
# - Visual validation of ML model score calibration
# - Calibration of log-likelihood ratios using logistic regression
# - Validation using known score distributions to calculate the true ratios
# - Visual inspection of calibration fit on a separate dataset of scores
#
# ## Logistic regression calibration
# - Estimate
# $$\text{llr}(x ; \omega) = \log \frac{ p(x \vert \omega_1)}{p(x \vert \omega_0)}$$
# - Using logistic regression, which estimates the target class posterior log-odds, i.e.
# $$\text{log-odds}(\omega; x) = \log \frac{ p(\omega_1 \vert x)}{p(\omega_0 \vert x)}$$
#
# - Conversion from log-odds to llr uses the relation log-odds = llr + effective prior (ep), hence llr = log-odds - ep (derivation below)
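# - For reference, the relation follows from applying Bayes' theorem to each class posterior; the evidence term $p(x)$ cancels in the ratio:
# $$\log \frac{p(\omega_1 \vert x)}{p(\omega_0 \vert x)} = \log \frac{p(x \vert \omega_1)}{p(x \vert \omega_0)} + \log \frac{P(\omega_1)}{P(\omega_0)}$$
# - so the effective prior is the log prior odds, $\text{ep} = \text{logit}\big(P(\omega_1)\big)$, estimated below from the class counts as $\text{logit}\left(\frac{N_{\text{tar}}}{N_{\text{tar}} + N_{\text{nontar}}}\right)$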
#
# ## Score distributions
# - I test two assumptions for the score class-conditional densities
# - Gaussian: logistic regression calibration works well, as expected
# - Skew-normal: lack of fit, because the true llr is no longer linear in the score (see the worked equation below)
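# - As a worked check of the Gaussian case: for class-conditional densities $\mathcal{N}(\mu_1, \sigma)$ and $\mathcal{N}(\mu_0, \sigma)$ with a shared $\sigma$, the true llr is exactly linear in the score,
# $$\text{llr}(x) = \frac{\mu_1 - \mu_0}{\sigma^2} x + \frac{\mu_0^2 - \mu_1^2}{2 \sigma^2}$$
# - which matches the linear form $\beta_0 + \beta_1 x$ fitted by logistic regression; the skew-normal density multiplies the Gaussian pdf by a skewing factor, adding a nonlinear term to the llr that the linear model cannot represent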
#
# The main source is *Tutorial on logistic-regression calibration and fusion: Converting a score to a likelihood ratio* by G. S. Morrison
#
# In[ ]:
# Stats
import numpy as np
import seaborn as sns
from scipy.special import logit
from scipy.stats import norm as f_norm, skewnorm
# Off the shelf stats models
from sklearn.linear_model import LogisticRegression as lr
import pandas as pd
import matplotlib.pyplot as plt
# In[ ]:
# Utils
def reshape_to_2d(array):
    """ Reshape a 1-D array of shape (n,) to shape (n, 1), as sklearn estimators require. """
    return np.reshape(array, (-1, 1))
def get_logistic_estimates(clf):
    """ Return the intercept and slope of a fitted univariate logistic regression. """
    return clf.intercept_[0], clf.coef_[0][0]
def get_effective_prior(tarN, nontarN):
    """ Log prior odds implied by the class counts, i.e. logit(P(w1)). """
    return logit(tarN/(tarN+nontarN))
def logistic_logodds(x, β_0, β_1):
    """ Log-odds predicted by the fitted logistic regression. """
    return β_0 + β_1*x
def log_likelihood_ratio_density(tar_density, nontar_density):
    """ Return a function that computes the true llr from the known class-conditional densities. """
    def func(x):
        return np.log(tar_density(x)/nontar_density(x))
    return func
# Data
def generate_data(nontar_rv, tar_rv, nontarN, tarN):
    """ Simulate the scores from two classes
    Args:
        nontar_rv (scipy.continuous_dist): rv for w0 class
        tar_rv (scipy.continuous_dist): rv for w1 class
        nontarN (int): number of w0 samples
        tarN (int): number of w1 samples
    """
    w0_sample = nontar_rv.rvs(nontarN)
    w1_sample = tar_rv.rvs(tarN)
    X = np.concatenate([w0_sample, w1_sample])
    y = [0]*nontarN + [1]*tarN
    return X, y
# Validation
def logistic_calibration_validation(x, y, test_x, nontarN, tarN, nontar_rv, tar_rv):
    """ Compare llr from logistic regression against the true llr computed from the known score densities.
    Args:
        x (np.array): raw scores
        y (np.array): labels w0 or w1
        test_x (np.array): validation dataset to visually inspect the calibrated scores
        nontarN (int): number of w0 samples in the calibration data
        tarN (int): number of w1 samples in the calibration data
        nontar_rv (scipy.continuous_dist): rv for w0
        tar_rv (scipy.continuous_dist): rv for w1
    """
    # True llr from the known densities
    llr_density = log_likelihood_ratio_density(tar_rv.pdf, nontar_rv.pdf)
    llr_true = [llr_density(s) for s in test_x]
    # Estimated llr: fit a logistic regression, then subtract the effective prior
    # Note: sklearn applies L2 regularisation by default (C=1.0)
    x_2d = reshape_to_2d(x)
    clf = lr(random_state=0).fit(x_2d, y)
    β_0, β_1 = get_logistic_estimates(clf)
    lo_preds = [logistic_logodds(s, β_0, β_1) for s in test_x]
    effective_prior = get_effective_prior(tarN, nontarN)
    llr_preds = [lo - effective_prior for lo in lo_preds]
return llr_true, llr_preds
def plot_true_and_predicted_llr(test_x, llr_true, llr_preds, title):
    """ Plot the true llr against the calibrated llr over the validation scores. """
    data = (
        pd.DataFrame({
            "x": test_x,
            "llr_true": llr_true,
            "llr_preds": llr_preds})
        .melt("x")
    )
    # seaborn >= 0.12 requires keyword arguments for x and y
    sns.lineplot(data=data, x="x", y="value", hue="variable").set_title(title)
# Main
def run(nontar_rv, tar_rv, nontarN, tarN, title):
    """ Generate scores, plot the class-conditional distributions, then compare true and calibrated llr. """
    X, y = generate_data(nontar_rv, tar_rv, nontarN, tarN)
    # distplot was removed from seaborn; histplot with a kde overlay is the current equivalent
    sns.histplot(X[:nontarN], kde=True, stat="density")
    sns.histplot(X[nontarN:], kde=True, stat="density")
    plt.title("Class-conditional score distributions")
    plt.show()
# Validation sample scores
    test_x = np.linspace(-1, 1, 15)  # alternative range: np.linspace(-3, 3, 40)
llr_true, llr_preds = logistic_calibration_validation(X, y, test_x, nontarN, tarN, nontar_rv, tar_rv)
plot_true_and_predicted_llr(test_x, llr_true, llr_preds, title)
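# In[ ]:
# Quick numeric sanity check of the log-odds to llr conversion, using illustrative
# counts rather than values from the article: with tarN=1_000 and nontarN=7_000 the
# effective prior is logit(1/8) = log(1/7) ≈ -1.9459, so a raw log-odds of 0.5 should
# map to a calibrated llr of about 0.5 - (-1.9459) = 2.4459.
ep = get_effective_prior(1_000, 7_000)
print(f"effective prior: {ep:.4f}")
print(f"llr for log-odds 0.5: {0.5 - ep:.4f}")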
# In[ ]:
nontarN = 1_000
tarN = 1_000
nontar_rv = f_norm(-3, 0.5)
tar_rv = f_norm(-2, 0.5)
run(nontar_rv, tar_rv, nontarN, tarN, "Balanced dataset with normally distributed scores")
# In[ ]:
tarN = 1_000
nontarN = int(7*tarN)
nontar_rv = f_norm(-3, 0.5)
tar_rv = f_norm(-2, 0.5)
run(nontar_rv, tar_rv, nontarN, tarN, "Imbalanced dataset with normally distributed scores")
# In[ ]:
tarN = 1_000
nontarN = 1_000
nontar_rv = skewnorm(a=4, loc=-0.5, scale=0.5)
tar_rv = skewnorm(a=-4, loc=0.5, scale=0.5)
run(nontar_rv, tar_rv, nontarN, tarN, "Balanced dataset with skew-normally distributed scores")