# Install for running in colab
!pip install evaluate datasets transformers[torch] &> /dev/null
!apt install git-lfs &> /dev/null

# To share your model with the community:
# first get your authentication token from the Hugging Face website, then execute this cell.
# Make sure the token has WRITE access.
from huggingface_hub import notebook_login
notebook_login()

# Import libraries
from pathlib import Path
import os
import pickle
import numpy as np
import pandas as pd
import re
from functools import partial
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_context(context="paper", font_scale=1.5)

# Load the Rep78 DMS data for supervised training, saved as a pkl file
# fitness_by_mutation_rep7868aav2 = 'https://github.com/arjan-hada/protein-variant-prediction/blob/master/data/fitness_by_mutation_rep7868aav2.pkl'
with open('data/fitness_by_mutation_rep7868aav2.pkl', 'rb') as f:
    fitness_by_mutation = pickle.load(f)
fitness_by_mutation.head()

# The median_fitness_wt value is used to normalize the entire dataset
# so that wild-type (WT) fitness corresponds to a value of 1
median_fitness_wt = fitness_by_mutation.loc[-1, 'median_fitness']

sequences = fitness_by_mutation["sequence"].tolist()
fitness = fitness_by_mutation.loc[:, 'median_fitness'].values  # obtain target

# transformation to normalize the WT fitness value to 1
fitness_norm = (fitness - np.min(fitness)) / (median_fitness_wt - np.min(fitness))

sns.histplot(np.log10(fitness_by_mutation['median_fitness'].values), kde=True)
plt.xlabel('log10(fitness)')
plt.ylabel('Density')
plt.title("Distribution of fitness values");

sns.histplot(fitness_by_mutation['median_fitness'].values, kde=True)
plt.xlabel('fitness')
plt.ylabel('Density')
plt.title("Distribution of fitness values");

from sklearn.model_selection import train_test_split

train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    sequences, fitness_norm, test_size=0.2, random_state=42)

sns.histplot(train_labels, kde=True)
plt.xlabel('fitness')
plt.ylabel('Density')
plt.title("Distribution of train fitness values");

sns.histplot(test_labels, kde=True)
plt.xlabel('fitness')
plt.ylabel('Density')
plt.title("Distribution of test fitness values");

model_checkpoint = "facebook/esm2_t12_35M_UR50D"
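# Optional sketch (not part of the original workflow): peek at the checkpoint's configuration
# before committing to fine-tuning. AutoConfig downloads only the small config.json, not the
# weights, so this is a cheap way to confirm the size of the model you are about to train.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_checkpoint)
print(f"layers: {config.num_hidden_layers}, hidden size: {config.hidden_size}")
# For facebook/esm2_t12_35M_UR50D this should report 12 transformer layers (the "t12" in the name).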
# The AutoTokenizer class automatically retrieves the model's configuration, pretrained weights,
# or vocabulary from the name of the checkpoint.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer(train_sequences[0])

train_tokenized = tokenizer(train_sequences)
test_tokenized = tokenizer(test_sequences)

from datasets import Dataset

train_ds = Dataset.from_dict(train_tokenized)
test_ds = Dataset.from_dict(test_tokenized)
train_ds

train_ds = train_ds.add_column("labels", train_labels)
test_ds = test_ds.add_column("labels", test_labels)
train_ds

import gc
import torch

gc.collect()
torch.cuda.empty_cache()

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# num_labels=1 puts a single-output regression head on top of the ESM-2 encoder
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=1)

from evaluate import load
import numpy as np

metric_spearmanr = load("spearmanr")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric_spearmanr.compute(predictions=predictions, references=labels)

torch.cuda.is_available()  # check whether a GPU is available; use a GPU runtime

# Note: `!export ...` in a notebook runs in a subshell and does not affect the Python process,
# so set the allocator option through os.environ instead
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

model_name = model_checkpoint.split("/")[-1]
lr, bs = 2e-5, 16  # would prefer bs=32 or 64, but not possible in Colab
wd, epochs = 0.01, 20

args = TrainingArguments(
    f"{model_name}-finetuned-rep7868aav2-v1",
    learning_rate=lr,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=2*bs,
    num_train_epochs=epochs,
    weight_decay=wd,
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train();

history = trainer.state.log_history
pd.DataFrame(history).head()

# code from: https://github.com/agemagician/ProtTrans/blob/master/Fine-Tuning/PT5_LorA_Finetuning_per_prot.ipynb
# Get loss, val_loss, and the computed metric from history
loss = [x['loss'] for x in history if 'loss' in x]
val_loss = [x['eval_loss'] for x in history if 'eval_loss' in x]

# Get spearmanr (for regression) or accuracy value (for classification)
if [x['eval_spearmanr'] for x in history if 'eval_spearmanr' in x] != []:
    metric = [x['eval_spearmanr'] for x in history if 'eval_spearmanr' in x]
else:
    metric = [x['eval_accuracy'] for x in history if 'eval_accuracy' in x]

epochs = [x['epoch'] for x in history if 'loss' in x]

# Create a figure with two y-axes
fig, ax1 = plt.subplots(figsize=(10, 5))
ax2 = ax1.twinx()

# Plot loss and val_loss on the first y-axis
line1 = ax1.plot(epochs, loss, label='train_loss')
line2 = ax1.plot(range(1, 21), val_loss, label='val_loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')

# Plot the computed metric on the second y-axis
line3 = ax2.plot(range(1, 21), metric, color='red', label='val_metric')
ax2.set_ylabel('Metric')
ax2.set_ylim([0, 1])

# Combine the lines from both y-axes and create a single legend
lines = line1 + line2 + line3
labels = [line.get_label() for line in lines]
ax1.legend(lines, labels, loc='lower left')

# Show the plot
plt.title("Training History")
plt.show()

trainer.save_model("models/esm2_t12_35M_UR50D-finetuned-rep7868aav2-v1")
#model.save_pretrained("path/to/model")

# save the model to the Hugging Face Hub
trainer.push_to_hub(commit_message="Training completed!")
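# Optional check (not in the original notebook): because load_best_model_at_end=True,
# `trainer.model` now holds the best checkpoint seen during training. trainer.evaluate() reruns
# evaluation on test_ds and returns a dict that includes eval_loss and eval_spearmanr, which
# should closely match the numbers computed manually with a DataLoader below.
best_metrics = trainer.evaluate()
print(best_metrics["eval_loss"], best_metrics["eval_spearmanr"])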
# Put both models on the same device
model = model.to("cpu")
model_reload = AutoModelForSequenceClassification.from_pretrained(
    "models/esm2_t12_35M_UR50D-finetuned-rep7868aav2-v1").to("cpu")

# Iterate through the parameters of the two models and compare the data
for param1, param2 in zip(model.parameters(), model_reload.parameters()):
    if not torch.equal(param1.data, param2.data):
        print("Models have different weights")
        break
else:
    print("Models have identical weights")

test_ds

from torch.utils.data import DataLoader
from tqdm import tqdm

# Set the device to use
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# make compatible with torch DataLoader
test_set = test_ds.with_format("torch", device=device)

# Create a dataloader for the test dataset
test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False)

# Put the model in evaluation mode
model.eval()

# Make predictions on the test dataset
predictions = []
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # add batch results (logits) to predictions
        predictions += model(input_ids, attention_mask=attention_mask).logits.tolist()

# Regression
from scipy import stats

spearmanr = stats.spearmanr(a=predictions, b=test_ds['labels'], axis=0)
print(spearmanr)

from sklearn.metrics import PredictionErrorDisplay

fig, (ax0, ax1) = plt.subplots(figsize=(12, 6), nrows=1, ncols=2)

# plot actual vs predicted values
PredictionErrorDisplay.from_predictions(
    test_ds['labels'],
    predictions,
    ax=ax0,
    kind='actual_vs_predicted',
    scatter_kwargs={"alpha": 0.5}
)
ax0.plot([], [], " ", label=f"Spearman r: {np.round(spearmanr.statistic, 4)}")
ax0.legend(loc="best")
ax0.axis('tight')

# plot residuals vs predicted values
PredictionErrorDisplay.from_predictions(
    test_ds['labels'],
    np.ravel(predictions),
    kind='residual_vs_predicted',
    ax=ax1,
    scatter_kwargs={"alpha": 0.5}
)
ax1.plot([], [], " ", label=f"Spearman r: {np.round(spearmanr.statistic, 4)}")
ax1.legend(loc="best")
ax1.axis('tight')

plt.tight_layout()
plt.show();

# predicted fitness for first entry in test_ds
predictions[0][0]

# Actual fitness for first entry in test_ds
test_ds['labels'][0]

# check to ensure test sequences are in correct order in test_ds
print(tokenizer(test_sequences[0])['input_ids'] == test_ds['input_ids'][0])
print(tokenizer(test_sequences[100])['input_ids'] == test_ds['input_ids'][100])

# Download final model from colab
!zip -r models/esm2_t12_35M_UR50D-finetuned-rep7868aav2-v1.zip models/esm2_t12_35M_UR50D-finetuned-rep7868aav2-v1/

# Download that zip file
from google.colab import files
files.download("models/esm2_t12_35M_UR50D-finetuned-rep7868aav2-v1.zip")
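# Hedged sketch (not in the original notebook): score a single variant with the fine-tuned
# regression head. It reuses test_sequences[0] so the output can be compared against
# predictions[0][0] above; swap in any candidate Rep78 variant sequence of interest.
import torch

def predict_fitness(seq, model=model, tokenizer=tokenizer, device=device):
    """Return the model's predicted (normalized) fitness for one protein sequence."""
    inputs = tokenizer(seq, return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    return logits.item()

print(predict_fitness(test_sequences[0]))  # should be close to predictions[0][0]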