#!/usr/bin/env python
# coding: utf-8

# # About
#
# ## Experiments
#
# Each "experiment" consists of a set of identical models, trained on different folds of the data (with consistent model and feature parameters).
#
# (Reason: the metrics of each model tend to vary a little, so training on many folds enables a more accurate comparison across models/features.)
#
# ## Metrics
#
# The main metrics tracked are:
# - Biased MAE (from [#9](https://github.com/related-sciences/nxontology-ml/issues/9))
# - The per-class "biases" (also from [#9](https://github.com/related-sciences/nxontology-ml/issues/9)) are:
# ```python
# biases = {
#     "01-disease-subtype": 0.245,
#     "02-disease-root": 0.245,
#     "03-disease-area": 0.5,
#     "04-non-disease": 0.01,
# }
#
# # Note: I didn't have the courage to set `biases["04-non-disease"]` to 0.
# ```
# - For both the training and CV sets: ROC AUC, F1 & model loss ([CatBoost's `MultiClass`](https://catboost.ai/en/docs/concepts/loss-functions-multiclassification))
#
# ## Parameters
#
# Constant parameters across each experiment (currently):
# - Shuffled inputs
# - 25-fold stratified CV
# - Best model is selected (based on Biased MAE)
# - 5000 training iterations
# - Learning rate of 0.5
#
# (A minimal sketch of this per-fold training scheme follows the first plot below.)

# # Loading Experiments Data

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

from experimentation.model_metadata import (
    df_from_all_experiments,
    NUMERICAL_COLUMNS,
    EXPERIMENT_MODEL_DIR,
)
from typing import List

import pandas as pd
import seaborn as sns

sns.set_theme()


# In[2]:


df = df_from_all_experiments()

# Sort experiments by median Biased MAE
experiment_median_mae = df.groupby("experiment_name")["biased_mae"].median().to_dict()
df = df.sort_values(
    by="experiment_name", key=lambda e: e.apply(experiment_median_mae.get)
)


# # Experiments by Biased MAE

# In[3]:


sns.boxplot(data=df, x="biased_mae", y="experiment_name")


# Notes about the experiments:
# - `baseline`: (Mostly) default parameters with network-only features (as presented in [#7](https://github.com/related-sciences/nxontology-ml/pull/7))
# - `mae`: Uses the (custom-coded) `BiasedMAE` metric as [CatBoost's custom `eval_metric`](https://catboost.ai/en/docs/concepts/python-usages-examples#custom-loss-function-eval-metric) (sketched below)
# - `d7`: CatBoost tree depth of 7 (default is 6) - although not obvious here, `d=7` usually helps when more features are used
# - `knn`: Uses Faiss nearest-neighbor (NN) tree proximity features (sketched below)
# - `lda`: 3 Class LDA for dimensionality reduction (sketched below)
# - `pcaX`: PCA with `X` components for dimensionality reduction (sketched below)
#
# For MAE comparison:
# - A random classifier with the true class-weight prior has a Biased MAE of `0.284`
# - A random classifier with a uniform class-weight prior has a Biased MAE of `0.348`
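
# To make the parameters above concrete: a minimal sketch of the per-fold training
# scheme, assuming a `CatBoostClassifier` and scikit-learn's `StratifiedKFold`
# (`X`/`y` are hypothetical feature/label arrays; the actual fold and model-selection
# code lives in the `experimentation` package and may differ):
#
# ```python
# from catboost import CatBoostClassifier
# from sklearn.model_selection import StratifiedKFold
#
# skf = StratifiedKFold(n_splits=25, shuffle=True, random_state=42)
# models = []
# for train_idx, cv_idx in skf.split(X, y):
#     model = CatBoostClassifier(
#         iterations=5000,
#         learning_rate=0.5,
#         use_best_model=True,  # keep the best iteration ("Best model is selected")
#     )
#     model.fit(
#         X[train_idx], y[train_idx], eval_set=(X[cv_idx], y[cv_idx]), verbose=False
#     )
#     models.append(model)
# ```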
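
# The `mae` variants pass a custom metric object as CatBoost's `eval_metric`. The
# three-method interface below is CatBoost's documented contract for custom Python
# metrics; the metric body itself is an illustrative guess at a class-biased MAE,
# not the repo's actual `BiasedMAE` implementation (which lives in
# `experimentation.model_utils`):
#
# ```python
# import numpy as np
#
# BIASES = np.array([0.245, 0.245, 0.5, 0.01])  # the class biases listed above
#
#
# class BiasedMaeMetric:
#     def is_max_optimal(self) -> bool:
#         return False  # lower error is better
#
#     def evaluate(self, approxes, target, weight):
#         # approxes: one indexed container of raw scores per class (CatBoost's
#         # contract); per-sample `weight` is ignored in this sketch
#         scores = np.array([list(col) for col in approxes])  # (n_classes, n_samples)
#         scores -= scores.max(axis=0)  # numerically stable softmax
#         probas = np.exp(scores) / np.exp(scores).sum(axis=0)
#         error_sum, weight_sum = 0.0, 0.0
#         for i, true_class in enumerate(target):
#             y_true = np.zeros(scores.shape[0])
#             y_true[int(true_class)] = 1.0
#             bias = BIASES[int(true_class)]  # illustrative use of the biases
#             error_sum += bias * np.abs(y_true - probas[:, i]).mean()
#             weight_sum += bias
#         return error_sum, weight_sum
#
#     def get_final_error(self, error, weight):
#         return error / (weight + 1e-38)
#
#
# # Usage (sketch): CatBoostClassifier(eval_metric=BiasedMaeMetric(), ...)
# ```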
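
# For the `knn` variants, a minimal sketch of building nearest-neighbor proximity
# features with Faiss, assuming a dense `float32` feature matrix (the repo's actual
# feature builder may differ):
#
# ```python
# import faiss
# import numpy as np
#
#
# def knn_distance_features(
#     train_x: np.ndarray, query_x: np.ndarray, k: int = 10
# ) -> np.ndarray:
#     index = faiss.IndexFlatL2(train_x.shape[1])  # exact L2 nearest-neighbor index
#     index.add(np.ascontiguousarray(train_x, dtype=np.float32))
#     distances, _ = index.search(np.ascontiguousarray(query_x, dtype=np.float32), k)
#     return distances  # k new columns to append to the feature matrix
# ```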
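
# Similarly, a hedged sketch of the `lda` and `pcaX` dimensionality reductions with
# scikit-learn, assuming "3 Class LDA" means reducing to 3 discriminant components
# (the maximum for 4 classes); the exact pipeline wiring in the repo may differ:
#
# ```python
# from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#
# # `pca128`: unsupervised reduction to 128 components (X, y: hypothetical arrays)
# x_pca = PCA(n_components=128).fit_transform(X)
#
# # `lda`: supervised reduction to 3 components (at most n_classes - 1)
# x_lda = LinearDiscriminantAnalysis(n_components=3).fit_transform(X, y)
# ```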

# # Top experiment (`pca128_d7_mae`) analysis

# ## CatBoost's `MetricVisualizer`

# In[4]:


import catboost

experiment_dir = EXPERIMENT_MODEL_DIR / "20230821_085648_pca128_d7_mae"
w = catboost.MetricVisualizer(experiment_dir.as_posix(), subdirs=True)
w.start()


# Notes:
# - This visualization runs JavaScript and won't be displayed on the web
# - The experiment data is needed locally for this code to render

# # Metrics Cross-Analysis

# In[5]:


excluded_columns = {
    "roc_auc",  # Correlates with biased_roc_auc
    "validation_top_AUC:type=Mu",  # Correlates with biased_roc_auc
    "report_F1:class=2",  # Correlates with biased_report_F1:class=2
    "mae",  # Correlates with biased_mae
}
columns = list({"experiment_name"} | (set(NUMERICAL_COLUMNS) - excluded_columns))
gbdf = df[columns]


# In[6]:


sns.pairplot(gbdf, hue="experiment_name")


# Notes:
# - `learn_`: Computed by CatBoost during training
# - `validation_`: Computed by CatBoost during CV
# - `report_`: From scikit-learn's classification report

# # Experiment Training Durations

# In[7]:


df["duration_s"] = (
    df["feature_building_d"] + df["model_training_d"]
).dt.total_seconds()
sns.catplot(data=df, kind="bar", x="duration_s", y="experiment_name")


# Notes:
# - These durations are in seconds, per fold

# # Experiment number of iterations

# In[8]:


sns.catplot(
    data=df,
    x="tree_cnt",
    y="experiment_name",
    kind="bar",
)


# Notes:
# - For the models nearing `5000` trees, a little more performance could likely be gained by raising the maximum number of iterations

# # MAE Baselines

# In[9]:


import numpy as np

from experimentation.model_utils import CLASS_WEIGHTS, mean_absolute_error


def mae_baseline(class_weights: np.ndarray) -> float:
    """Expected (biased) MAE of a random classifier drawing labels from `class_weights`."""
    biased_mae = 0
    for i, w_i in enumerate(CLASS_WEIGHTS):
        for j, w_j in enumerate(class_weights):
            y_true = np.zeros(4)
            y_true[i] = 1
            y = np.zeros(4)
            y[j] = 1
            biased_mae += (
                w_i * w_j * mean_absolute_error(np.array([y_true]), np.array([y]))
            )  # Biased by default
    return biased_mae


print(f"Baseline MAE (with weighted class prior): {mae_baseline(CLASS_WEIGHTS):.3f}")
print(
    f"Baseline MAE (with uniform class prior): {mae_baseline(np.array([.25, .25, .25, .25])):.3f}"
)


# In[10]:


# Empirical check of the analytical baseline above
N = 100000
print(
    mean_absolute_error(
        y_true=np.array([np.random.multinomial(1, CLASS_WEIGHTS) for _ in range(N)]),
        y_probas=np.array([np.random.multinomial(1, CLASS_WEIGHTS) for _ in range(N)]),
    )
)


# # Some Conclusions (mostly from past experiments)
#
# * Having a custom eval metric is very important
# * More iterations help
# * Performance declines for learning rates above 1 (sweet spot ~0.5) - TODO: Add proof
# * GPUs do not support custom eval metrics
# * Adding weights to samples does not help - TODO: Add proof
# * (Depth helps but makes training slower - TODO: Add better proof)
#
# # Possible Next steps
#
# * Add GPT-4 labels to the features
# * Node error analysis:
#   1. Find the nodes where the biggest mistakes are made
#   2. Try to find patterns in the data (& features)
#   3. Craft useful features if possible
# * (Add feature importances to this report? See the sketch below.)
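
# A possible starting point for the feature-importance idea above, using CatBoost's
# built-in importances (the `model.cbm` file name is hypothetical - the per-fold
# file layout of the experiment directory isn't shown here):
#
# ```python
# import catboost
#
# model = catboost.CatBoostClassifier()
# model.load_model((experiment_dir / "model.cbm").as_posix())  # hypothetical path
# importances = model.get_feature_importance(prettified=True)  # DataFrame of features
# print(importances.head(20))
# ```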