Each "experiment" consists in a set of identical models, trained on different folds of data (with both consistent model and features parameters).
(Reason: The metrics for each model tend to vary a little, so training them on many folds enables a more accurate comparison across models/features)
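For illustration, a minimal sketch of this protocol, assuming a CatBoost classifier and sklearn's StratifiedKFold (the actual training loop lives in the experimentation package and isn't shown here; run_experiment is a hypothetical helper name):

import numpy as np
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

def run_experiment(X: np.ndarray, y: np.ndarray, n_folds: int = 5) -> list:
    # One "experiment": the same model/feature configuration trained per fold
    models = []
    for train_idx, valid_idx in StratifiedKFold(n_splits=n_folds).split(X, y):
        model = CatBoostClassifier(loss_function="MultiClass", verbose=False)
        model.fit(X[train_idx], y[train_idx], eval_set=(X[valid_idx], y[valid_idx]))
        models.append(model)
    return models  # per-fold metrics can then be compared across experiments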
The main metrics tracked are the class-weighted ("biased") variants biased_mae, biased_roc_auc, and biased_report_F1:class=2, using the following class weights:
biases = {
    "01-disease-subtype": 0.245,
    "02-disease-root": 0.245,
    "03-disease-area": 0.5,
    "04-non-disease": 0.01,
}
# Note: I didn't have the courage to set `biases["04-non-disease"]` to 0.
These biases, together with CatBoost's MultiClass loss function, are constant parameters across each experiment (currently).
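The biased metrics themselves aren't defined in this section; as an illustration, here is a minimal sketch of a class-weighted MAE, assuming each sample's error is weighted by the bias of its true class (the actual implementation lives in experimentation.model_utils and may differ):

import numpy as np

def biased_mae_sketch(y_true: np.ndarray, y_probas: np.ndarray, class_weights: np.ndarray) -> float:
    # y_true: one-hot labels (n_samples, n_classes); y_probas: predicted probabilities
    sample_weights = y_true @ class_weights  # bias of each sample's true class
    errors = np.abs(y_true - y_probas).mean(axis=1)  # per-sample MAE
    return float((sample_weights * errors).sum() / sample_weights.sum())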
%load_ext autoreload
%autoreload 2
from experimentation.model_metadata import (
    df_from_all_experiments,
    NUMERICAL_COLUMNS,
    EXPERIMENT_MODEL_DIR,
)
from typing import List
import pandas as pd
import seaborn as sns
sns.set_theme()
df = df_from_all_experiments()
# Sort by median BiasedMAE
experiment_median_mae = df.groupby("experiment_name")["biased_mae"].median().to_dict()
df = df.sort_values(
    by="experiment_name", key=lambda e: e.apply(experiment_median_mae.get)
)
sns.boxplot(data=df, x="biased_mae", y="experiment_name")
<Axes: xlabel='biased_mae', ylabel='experiment_name'>
Notes about experiments:

baseline: (Mostly) default parameters for network-only features (as presented in #7)
mae: Uses the (custom coded) BiasedMAE metric as CatBoost's custom eval_metric (see the sketch after this list)
d7: CatBoost tree depth of 7 (default is 6) - although not obvious here, d=7 usually helps when using more features
knn: Uses Faiss NN tree proximity features
lda: 3-class LDA for dimensionality reduction
pcaX: PCA with X components for dimensionality reduction

For MAE comparison: the baseline values computed at the end of this notebook are 0.284 (with the weighted class prior) and 0.348 (with a uniform class prior).
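Regarding the mae variants above: the actual BiasedMAE plumbing isn't shown here, but CatBoost accepts any object implementing its three custom-metric methods as eval_metric. A minimal sketch of that protocol, with the class biasing assumed to enter through the per-sample weights (hypothetical class name, not the real implementation):

import numpy as np

class BiasedMAEEvalMetric:
    # Sketch of CatBoost's custom eval_metric interface
    def is_max_optimal(self) -> bool:
        return False  # lower error is better

    def evaluate(self, approxes, target, weight):
        # approxes: one sequence of raw scores per class -> softmax to probabilities
        scores = np.array(approxes)
        scores -= scores.max(axis=0)
        probas = np.exp(scores) / np.exp(scores).sum(axis=0)
        error_sum = weight_sum = 0.0
        for i, label in enumerate(target):
            w = 1.0 if weight is None else weight[i]  # class biases could enter here
            y_true = np.zeros(len(approxes))
            y_true[int(label)] = 1.0
            error_sum += w * np.abs(y_true - probas[:, i]).mean()
            weight_sum += w
        return error_sum, weight_sum

    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

It would then be plugged in as CatBoostClassifier(eval_metric=BiasedMAEEvalMetric()).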
pca128_d7_mae analysis

MetricVisualizer

import catboost
experiment_dir = EXPERIMENT_MODEL_DIR / "20230821_085648_pca128_d7_mae"
w = catboost.MetricVisualizer(experiment_dir.as_posix(), subdirs=True)
w.start()
# Notes:
# - This visualization runs JavaScript and won't be displayed on the web
# - The experiment data is needed locally for this code to render
MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))
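Since the widget needs a live kernel, a static alternative is to read the training logs directly, assuming CatBoost's default train_dir layout where each fold subdirectory contains tab-separated learn_error.tsv / test_error.tsv files (one row per iteration):

import pandas as pd

for error_file in sorted(experiment_dir.glob("*/test_error.tsv")):
    log = pd.read_csv(error_file, sep="\t")
    print(error_file.parent.name, log.iloc[-1].to_dict())  # final-iteration metrics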
excluded_columns = {
    "roc_auc",  # Correlated with biased_roc_auc
    "validation_top_AUC:type=Mu",  # Correlated with biased_roc_auc
    "report_F1:class=2",  # Correlated with biased_report_F1:class=2
    "mae",  # Correlated with biased_mae
}
columns = list({"experiment_name"} | (set(NUMERICAL_COLUMNS) - excluded_columns))
gbdf = df[columns]
sns.pairplot(gbdf, hue="experiment_name")
<seaborn.axisgrid.PairGrid at 0x151978fd0>
Notes:

learn_: Computed by CatBoost during training
validation_: Computed by CatBoost during CV
report_: From SKLearn's classification report (a sketch of the flattening follows the duration plot below)

df["duration_s"] = (
    df["feature_building_d"] + df["model_training_d"]
).dt.total_seconds()
sns.catplot(data=df, kind="bar", x="duration_s", y="experiment_name")
<seaborn.axisgrid.FacetGrid at 0x151979210>
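As an aside on the report_ columns mentioned above: a minimal sketch of how such columns could be flattened from SKLearn's classification report (the actual naming logic in experimentation.model_metadata isn't shown here; report_columns_sketch is a hypothetical helper):

from sklearn.metrics import classification_report

def report_columns_sketch(y_true, y_pred) -> dict:
    # Flatten sklearn's per-class scores into columns like "report_F1:class=2"
    report = classification_report(y_true, y_pred, output_dict=True)
    return {
        f"report_F1:class={label}": scores["f1-score"]
        for label, scores in report.items()
        if label not in ("accuracy", "macro avg", "weighted avg")
    }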
Notes:
sns.catplot(
data=df,
x="tree_cnt",
y="experiment_name",
kind="bar",
)
<seaborn.axisgrid.FacetGrid at 0x155196050>
Notes:

Several experiments hit the cap of 5000 trees, so a little more performance could be gained by raising the maximum number of iterations.
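For example (hypothetical parameter values; the real configuration lives in the experimentation package), the cap could be raised when constructing the model:

from catboost import CatBoostClassifier

model = CatBoostClassifier(
    iterations=10000,  # hypothetical: raised from the current cap of 5000
    depth=7,  # as in the d7 variants
    loss_function="MultiClass",
)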
The baseline MAE values quoted earlier ("For MAE comparison") are computed from the class priors:

import numpy as np
from experimentation.model_utils import CLASS_WEIGHTS, mean_absolute_error
def mae_baseline(class_weights: np.ndarray) -> float:
    """Expected MAE when predictions are one-hot draws from `class_weights`
    and true labels are one-hot draws from CLASS_WEIGHTS."""
    biased_mae = 0.0
    for i, w_i in enumerate(CLASS_WEIGHTS):
        for j, w_j in enumerate(class_weights):
            y_true = np.zeros(4)
            y_true[i] = 1
            y = np.zeros(4)
            y[j] = 1
            biased_mae += (
                w_i * w_j * mean_absolute_error(np.array([y_true]), np.array([y]))
            )  # mean_absolute_error is biased by default
    return biased_mae
print(f"Baseline MAE (with weighted class prior): {mae_baseline(CLASS_WEIGHTS):.3f}")
print(
    f"Baseline MAE (with uniform class prior): {mae_baseline(np.array([.25, .25, .25, .25])):.3f}"
)
Baseline MAE (with weighted class prior): 0.284
Baseline MAE (with uniform class prior): 0.348
# Empirical check
N = 100000
print(
    mean_absolute_error(
        y_true=np.array([np.random.multinomial(1, CLASS_WEIGHTS) for _ in range(N)]),
        y_probas=np.array([np.random.multinomial(1, CLASS_WEIGHTS) for _ in range(N)]),
    )
)
0.28375