#!/usr/bin/env python
# coding: utf-8

# # About
#
# ## Experiments
#
# Each "experiment" consists of a set of identical models, trained on different folds of the data (with consistent model and feature parameters).
#
# (Reason: the metrics of each model tend to vary a little, so training on many folds enables a more accurate comparison across models/features.)
#
# ## Metrics
#
# The main metrics tracked are:
# - Biased MAE (from [#9](https://github.com/related-sciences/nxontology-ml/issues/9))
# - The per-class "biases" (also from [#9](https://github.com/related-sciences/nxontology-ml/issues/9)) are:
# ```python
# biases = {
#     "01-disease-subtype": 0.245,
#     "02-disease-root": 0.245,
#     "03-disease-area": 0.5,
#     "04-non-disease": 0.01,
# }
#
# # Note: I didn't have the courage to set `biases["04-non-disease"]` to 0.
# ```
# - For both the training and CV sets: ROC AUC, F1 & model loss ([CatBoost's `MultiClass`](https://catboost.ai/en/docs/concepts/loss-functions-multiclassification))
#
# ## Parameters
#
# Constant parameters across each experiment (currently):
# - Shuffled inputs
# - 25-fold stratified CV
# - Best model is selected (based on Biased MAE)
# - 5000 training iterations
# - Learning rate of 0.5
#
# (A minimal sketch of this per-fold training scheme follows the first plot below.)

# # Loading Experiments Data

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

from experimentation.model_metadata import (
    df_from_all_experiments,
    NUMERICAL_COLUMNS,
    EXPERIMENT_MODEL_DIR,
)
from typing import List

import pandas as pd
import seaborn as sns

sns.set_theme()


# In[2]:


df = df_from_all_experiments()

# Sort experiments by median Biased MAE
experiment_median_mae = df.groupby("experiment_name")["biased_mae"].median().to_dict()
df = df.sort_values(
    by="experiment_name", key=lambda e: e.apply(experiment_median_mae.get)
)


# # Experiments by Biased MAE

# In[3]:


sns.boxplot(data=df, x="biased_mae", y="experiment_name")


# Notes about the experiments:
# - `baseline`: (Mostly) default parameters with network-only features (as presented in [#7](https://github.com/related-sciences/nxontology-ml/pull/7))
# - `mae`: Uses the (custom-coded) `BiasedMAE` metric as [CatBoost's custom `eval_metric`](https://catboost.ai/en/docs/concepts/python-usages-examples#custom-loss-function-eval-metric) (sketched below)
# - `d7`: CatBoost tree depth of 7 (default is 6) - although not obvious here, `d=7` usually helps when more features are used
# - `knn`: Uses Faiss nearest-neighbor (NN) tree proximity features (sketched below)
# - `lda`: 3 Class LDA for dimensionality reduction (sketched below)
# - `pcaX`: PCA with `X` components for dimensionality reduction (sketched below)
#
# For MAE comparison:
# - A random classifier with the true class-weight prior has a Biased MAE of `0.284`
# - A random classifier with a uniform class-weight prior has a Biased MAE of `0.348`
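
# To make the parameters above concrete: a minimal sketch of the per-fold training
# scheme, assuming a `CatBoostClassifier` and scikit-learn's `StratifiedKFold`
# (`X`/`y` are hypothetical feature/label arrays; the actual fold and model-selection
# code lives in the `experimentation` package and may differ):
#
# ```python
# from catboost import CatBoostClassifier
# from sklearn.model_selection import StratifiedKFold
#
# skf = StratifiedKFold(n_splits=25, shuffle=True, random_state=42)
# models = []
# for train_idx, cv_idx in skf.split(X, y):
#     model = CatBoostClassifier(
#         iterations=5000,
#         learning_rate=0.5,
#         use_best_model=True,  # keep the best iteration ("Best model is selected")
#     )
#     model.fit(
#         X[train_idx], y[train_idx], eval_set=(X[cv_idx], y[cv_idx]), verbose=False
#     )
#     models.append(model)
# ```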
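
# The `mae` variants pass a custom metric object as CatBoost's `eval_metric`. The
# three-method interface below is CatBoost's documented contract for custom Python
# metrics; the metric body itself is an illustrative guess at a class-biased MAE,
# not the repo's actual `BiasedMAE` implementation (which lives in
# `experimentation.model_utils`):
#
# ```python
# import numpy as np
#
# BIASES = np.array([0.245, 0.245, 0.5, 0.01])  # the class biases listed above
#
#
# class BiasedMaeMetric:
#     def is_max_optimal(self) -> bool:
#         return False  # lower error is better
#
#     def evaluate(self, approxes, target, weight):
#         # approxes: one indexed container of raw scores per class (CatBoost's
#         # contract); per-sample `weight` is ignored in this sketch
#         scores = np.array([list(col) for col in approxes])  # (n_classes, n_samples)
#         scores -= scores.max(axis=0)  # numerically stable softmax
#         probas = np.exp(scores) / np.exp(scores).sum(axis=0)
#         error_sum, weight_sum = 0.0, 0.0
#         for i, true_class in enumerate(target):
#             y_true = np.zeros(scores.shape[0])
#             y_true[int(true_class)] = 1.0
#             bias = BIASES[int(true_class)]  # illustrative use of the biases
#             error_sum += bias * np.abs(y_true - probas[:, i]).mean()
#             weight_sum += bias
#         return error_sum, weight_sum
#
#     def get_final_error(self, error, weight):
#         return error / (weight + 1e-38)
#
#
# # Usage (sketch): CatBoostClassifier(eval_metric=BiasedMaeMetric(), ...)
# ```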
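
# For the `knn` variants, a minimal sketch of building nearest-neighbor proximity
# features with Faiss, assuming a dense `float32` feature matrix (the repo's actual
# feature builder may differ):
#
# ```python
# import faiss
# import numpy as np
#
#
# def knn_distance_features(
#     train_x: np.ndarray, query_x: np.ndarray, k: int = 10
# ) -> np.ndarray:
#     index = faiss.IndexFlatL2(train_x.shape[1])  # exact L2 nearest-neighbor index
#     index.add(np.ascontiguousarray(train_x, dtype=np.float32))
#     distances, _ = index.search(np.ascontiguousarray(query_x, dtype=np.float32), k)
#     return distances  # k new columns to append to the feature matrix
# ```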
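
# Similarly, a hedged sketch of the `lda` and `pcaX` dimensionality reductions with
# scikit-learn, assuming "3 Class LDA" means reducing to 3 discriminant components
# (the maximum for 4 classes); the exact pipeline wiring in the repo may differ:
#
# ```python
# from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#
# # `pca128`: unsupervised reduction to 128 components (X, y: hypothetical arrays)
# x_pca = PCA(n_components=128).fit_transform(X)
#
# # `lda`: supervised reduction to 3 components (at most n_classes - 1)
# x_lda = LinearDiscriminantAnalysis(n_components=3).fit_transform(X, y)
# ```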

# # Top experiment (`pca128_d7_mae`) analysis

# ## CatBoost's `MetricVisualizer`

# In[4]:


import catboost

experiment_dir = EXPERIMENT_MODEL_DIR / "20230821_085648_pca128_d7_mae"
w = catboost.MetricVisualizer(experiment_dir.as_posix(), subdirs=True)
w.start()


# Notes:
# - This visualization runs JavaScript and won't be displayed on the web
# - The experiment data is needed locally for this code to render

# # Metrics Cross-Analysis

# In[5]:


excluded_columns = {
    "roc_auc",  # Correlates with biased_roc_auc
    "validation_top_AUC:type=Mu",  # Correlates with biased_roc_auc
    "report_F1:class=2",  # Correlates with biased_report_F1:class=2
    "mae",  # Correlates with biased_mae
}
columns = list({"experiment_name"} | (set(NUMERICAL_COLUMNS) - excluded_columns))
gbdf = df[columns]


# In[6]:


sns.pairplot(gbdf, hue="experiment_name")


# Notes:
# - `learn_`: Computed by CatBoost during training
# - `validation_`: Computed by CatBoost during CV
# - `report_`: From scikit-learn's classification report

# # Experiment Training Durations

# In[7]:


df["duration_s"] = (
    df["feature_building_d"] + df["model_training_d"]
).dt.total_seconds()
sns.catplot(data=df, kind="bar", x="duration_s", y="experiment_name")


# Notes:
# - These durations are in seconds, per fold

# # Experiment number of iterations

# In[8]:


sns.catplot(
    data=df,
    x="tree_cnt",
    y="experiment_name",
    kind="bar",
)


# Notes:
# - For the models nearing `5000` trees, a little more performance could likely be gained by raising the maximum number of iterations

# # MAE Baselines

# In[9]:


import numpy as np

from experimentation.model_utils import CLASS_WEIGHTS, mean_absolute_error


def mae_baseline(class_weights: np.ndarray) -> float:
    """Expected (biased) MAE of a random classifier drawing labels from `class_weights`."""
    biased_mae = 0
    for i, w_i in enumerate(CLASS_WEIGHTS):
        for j, w_j in enumerate(class_weights):
            y_true = np.zeros(4)
            y_true[i] = 1
            y = np.zeros(4)
            y[j] = 1
            biased_mae += (
                w_i * w_j * mean_absolute_error(np.array([y_true]), np.array([y]))
            )  # Biased by default
    return biased_mae


print(f"Baseline MAE (with weighted class prior): {mae_baseline(CLASS_WEIGHTS):.3f}")
print(
    f"Baseline MAE (with uniform class prior): {mae_baseline(np.array([.25, .25, .25, .25])):.3f}"
)


# In[10]:


# Empirical check of the analytical baseline above
N = 100000
print(
    mean_absolute_error(
        y_true=np.array([np.random.multinomial(1, CLASS_WEIGHTS) for _ in range(N)]),
        y_probas=np.array([np.random.multinomial(1, CLASS_WEIGHTS) for _ in range(N)]),
    )
)


# # Some Conclusions (mostly from past experiments)
#
# * Having a custom eval metric is very important
# * More iterations help
# * Performance declines for learning rates above 1 (sweet spot ~0.5) - TODO: Add proof
# * GPUs do not support custom eval metrics
# * Adding weights to samples does not help - TODO: Add proof
# * (Depth helps but makes training slower - TODO: Add better proof)
#
# # Possible Next steps
#
# * Add GPT-4 labels to the features
# * Node error analysis:
#   1. Find the nodes where the biggest mistakes are made
#   2. Try to find patterns in the data (& features)
#   3. Craft useful features if possible
# * (Add feature importances to this report? See the sketch below.)
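
# A possible starting point for the feature-importance idea above, using CatBoost's
# built-in importances (the `model.cbm` file name is hypothetical - the per-fold
# file layout of the experiment directory isn't shown here):
#
# ```python
# import catboost
#
# model = catboost.CatBoostClassifier()
# model.load_model((experiment_dir / "model.cbm").as_posix())  # hypothetical path
# importances = model.get_feature_importance(prettified=True)  # DataFrame of features
# print(importances.head(20))
# ```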