#!/usr/bin/env python
# coding: utf-8
# Michel Kiffel
# mkiffel@gmail.com
# This notebook supports [this blog article](https://mkffl.github.io/2022/03/02/Decisions-Part-3.html)
# ## Objective
# - Visual validation of ML model score calibration
# - Calibration of log-likelihood ratios using logistic regression
# - Validation using known score distributions to calculate the true ratios
# - Visual inspection of calibration fit on a separate dataset of scores
#
# ## Logistic regression calibration
# - Estimate
# $$\text{llr}(x ; \omega) = \log \frac{ p(x \vert \omega_1)}{p(x \vert \omega_0)}$$
# - Using logistic regression, which estimates the target class posterior log-odds, i.e.
# $$\text{log-odds}(\omega; x) = \log \frac{ p(\omega_1 \vert x)}{p(\omega_0 \vert x)}$$
#
# - Conversion from log-odds to llr uses the relation log-odds = llr + effective prior (ep), hence llr = log-odds - ep (derivation below)
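# - For reference, the relation follows from applying Bayes' theorem to each class posterior; the evidence term $p(x)$ cancels in the ratio:
# $$\log \frac{p(\omega_1 \vert x)}{p(\omega_0 \vert x)} = \log \frac{p(x \vert \omega_1)}{p(x \vert \omega_0)} + \log \frac{P(\omega_1)}{P(\omega_0)}$$
# - so the effective prior is the log prior odds, $\text{ep} = \text{logit}\big(P(\omega_1)\big)$, estimated below from the class counts as $\text{logit}\left(\frac{N_{\text{tar}}}{N_{\text{tar}} + N_{\text{nontar}}}\right)$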
#
# ## Score distributions
# - I test two assumptions for the score class-conditional densities
# - Gaussian: logistic regression calibration works well, as expected
# - Skew-normal: lack of fit, because the true llr is no longer linear in the score (see the worked equation below)
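# - As a worked check of the Gaussian case: for class-conditional densities $\mathcal{N}(\mu_1, \sigma)$ and $\mathcal{N}(\mu_0, \sigma)$ with a shared $\sigma$, the true llr is exactly linear in the score,
# $$\text{llr}(x) = \frac{\mu_1 - \mu_0}{\sigma^2} x + \frac{\mu_0^2 - \mu_1^2}{2 \sigma^2}$$
# - which matches the linear form $\beta_0 + \beta_1 x$ fitted by logistic regression; the skew-normal density multiplies the Gaussian pdf by a skewing factor, adding a nonlinear term to the llr that the linear model cannot represent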
#
# The main source is *Tutorial on logistic-regression calibration and fusion: Converting a score to a likelihood ratio* by G. S. Morrison
#
# In[ ]:
# Stats
import numpy as np
import seaborn as sns
from scipy.special import logit
from scipy.stats import norm as f_norm, skewnorm
# Off the shelf stats models
from sklearn.linear_model import LogisticRegression as lr
import pandas as pd
import matplotlib.pyplot as plt
# In[ ]:
# Utils
def reshape_to_2d(array):
    """ Reshape a 1-D array of shape (n,) to shape (n, 1), as sklearn estimators require. """
    return np.reshape(array, (-1, 1))
def get_logistic_estimates(clf):
    """ Return the intercept and slope of a fitted univariate logistic regression. """
    return clf.intercept_[0], clf.coef_[0][0]
def get_effective_prior(tarN, nontarN):
    """ Log prior odds implied by the class counts, i.e. logit(P(w1)). """
    return logit(tarN/(tarN+nontarN))
def logistic_logodds(x, β_0, β_1):
    """ Log-odds predicted by the fitted logistic regression. """
    return β_0 + β_1*x
def log_likelihood_ratio_density(tar_density, nontar_density):
    """ Return a function that computes the true llr from the known class-conditional densities. """
    def func(x):
        return np.log(tar_density(x)/nontar_density(x))
    return func
# Data
def generate_data(nontar_rv, tar_rv, nontarN, tarN):
    """ Simulate the scores from two classes
    Args:
        nontar_rv (scipy.continuous_dist): rv for w0 class
        tar_rv (scipy.continuous_dist): rv for w1 class
        nontarN (int): number of w0 samples
        tarN (int): number of w1 samples
    """
    w0_sample = nontar_rv.rvs(nontarN)
    w1_sample = tar_rv.rvs(tarN)
    X = np.concatenate([w0_sample, w1_sample])
    y = [0]*nontarN + [1]*tarN
    return X, y
# Validation
def logistic_calibration_validation(x, y, test_x, nontarN, tarN, nontar_rv, tar_rv):
    """ Compare llr from logistic regression against the true llr computed from the known score densities.
    Args:
        x (np.array): raw scores
        y (np.array): labels w0 or w1
        test_x (np.array): validation dataset to visually inspect the calibrated scores
        nontarN (int): number of w0 samples in the calibration data
        tarN (int): number of w1 samples in the calibration data
        nontar_rv (scipy.continuous_dist): rv for w0
        tar_rv (scipy.continuous_dist): rv for w1
    """
    # True llr from the known densities
    llr_density = log_likelihood_ratio_density(tar_rv.pdf, nontar_rv.pdf)
    llr_true = [llr_density(s) for s in test_x]
    # Estimated llr: fit a logistic regression, then subtract the effective prior
    # Note: sklearn applies L2 regularisation by default (C=1.0)
    x_2d = reshape_to_2d(x)
    clf = lr(random_state=0).fit(x_2d, y)
    β_0, β_1 = get_logistic_estimates(clf)
    lo_preds = [logistic_logodds(s, β_0, β_1) for s in test_x]
    effective_prior = get_effective_prior(tarN, nontarN)
    llr_preds = [lo - effective_prior for lo in lo_preds]
return llr_true, llr_preds
def plot_true_and_predicted_llr(test_x, llr_true, llr_preds, title):
    """ Plot the true llr against the calibrated llr over the validation scores. """
    data = (
        pd.DataFrame({
            "x": test_x,
            "llr_true": llr_true,
            "llr_preds": llr_preds})
        .melt("x")
    )
    # seaborn >= 0.12 requires keyword arguments for x and y
    sns.lineplot(data=data, x="x", y="value", hue="variable").set_title(title)
# Main
def run(nontar_rv, tar_rv, nontarN, tarN, title):
    """ Generate scores, plot the class-conditional distributions, then compare true and calibrated llr. """
    X, y = generate_data(nontar_rv, tar_rv, nontarN, tarN)
    # distplot was removed from seaborn; histplot with a kde overlay is the current equivalent
    sns.histplot(X[:nontarN], kde=True, stat="density")
    sns.histplot(X[nontarN:], kde=True, stat="density")
    plt.title("Class-conditional score distributions")
    plt.show()
# Validation sample scores
    test_x = np.linspace(-1, 1, 15)  # alternative range: np.linspace(-3, 3, 40)
llr_true, llr_preds = logistic_calibration_validation(X, y, test_x, nontarN, tarN, nontar_rv, tar_rv)
plot_true_and_predicted_llr(test_x, llr_true, llr_preds, title)
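# In[ ]:
# Quick numeric sanity check of the log-odds to llr conversion, using illustrative
# counts rather than values from the article: with tarN=1_000 and nontarN=7_000 the
# effective prior is logit(1/8) = log(1/7) ≈ -1.9459, so a raw log-odds of 0.5 should
# map to a calibrated llr of about 0.5 - (-1.9459) = 2.4459.
ep = get_effective_prior(1_000, 7_000)
print(f"effective prior: {ep:.4f}")
print(f"llr for log-odds 0.5: {0.5 - ep:.4f}")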
# In[ ]:
nontarN = 1_000
tarN = 1_000
nontar_rv = f_norm(-3, 0.5)
tar_rv = f_norm(-2, 0.5)
run(nontar_rv, tar_rv, nontarN, tarN, "Balanced dataset with normally distributed scores")
# In[ ]:
tarN = 1_000
nontarN = int(7*tarN)
nontar_rv = f_norm(-3, 0.5)
tar_rv = f_norm(-2, 0.5)
run(nontar_rv, tar_rv, nontarN, tarN, "Imbalanced dataset with normally distributed scores")
# In[ ]:
tarN = 1_000
nontarN = 1_000
nontar_rv = skewnorm(a=4, loc=-0.5, scale=0.5)
tar_rv = skewnorm(a=-4, loc=0.5, scale=0.5)
run(nontar_rv, tar_rv, nontarN, tarN, "Balanced dataset with skew-normally distributed scores")