# Install for running in colab
!pip install evaluate datasets transformers[torch] &> /dev/null
!apt install git-lfs &> /dev/null

# To share your model with the community:
# first get your authentication token from the Hugging Face website, then execute this cell.
# Make sure the token has WRITE access.
from huggingface_hub import notebook_login
notebook_login()

# Import libraries
from pathlib import Path
import os
import pickle
import numpy as np
import pandas as pd
import re
from functools import partial
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_context(context="paper", font_scale=1.5)

# Load the Rep78 DMS data for supervised training, saved as a pkl file
# fitness_by_mutation_rep7868aav2 = 'https://github.com/arjan-hada/protein-variant-prediction/blob/master/data/fitness_by_mutation_rep7868aav2.pkl'
with open('data/fitness_by_mutation_rep7868aav2.pkl', 'rb') as f:
    fitness_by_mutation = pickle.load(f)
fitness_by_mutation.head()

# The median_fitness_wt value is used to normalize the entire dataset
# so that wild-type (WT) fitness corresponds to a value of 1
median_fitness_wt = fitness_by_mutation.loc[-1, 'median_fitness']

sequences = fitness_by_mutation["sequence"].tolist()
fitness = fitness_by_mutation.loc[:, 'median_fitness'].values  # obtain target

# transformation to normalize the WT fitness value to 1
fitness_norm = (fitness - np.min(fitness)) / (median_fitness_wt - np.min(fitness))

sns.histplot(np.log10(fitness_by_mutation['median_fitness'].values), kde=True)
plt.xlabel('log10(fitness)')
plt.ylabel('Density')
plt.title("Distribution of fitness values");

sns.histplot(fitness_by_mutation['median_fitness'].values, kde=True)
plt.xlabel('fitness')
plt.ylabel('Density')
plt.title("Distribution of fitness values");

from sklearn.model_selection import train_test_split

train_sequences, test_sequences, train_labels, test_labels = train_test_split(
    sequences, fitness_norm, test_size=0.2, random_state=42)

sns.histplot(train_labels, kde=True)
plt.xlabel('fitness')
plt.ylabel('Density')
plt.title("Distribution of train fitness values");

sns.histplot(test_labels, kde=True)
plt.xlabel('fitness')
plt.ylabel('Density')
plt.title("Distribution of test fitness values");

model_checkpoint = "facebook/esm2_t12_35M_UR50D"
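# Optional sketch (not part of the original workflow): peek at the checkpoint's configuration
# before committing to fine-tuning. AutoConfig downloads only the small config.json, not the
# weights, so this is a cheap way to confirm the size of the model you are about to train.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_checkpoint)
print(f"layers: {config.num_hidden_layers}, hidden size: {config.hidden_size}")
# For facebook/esm2_t12_35M_UR50D this should report 12 transformer layers (the "t12" in the name).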
# The AutoTokenizer class automatically retrieves the model's configuration, pretrained weights,
# or vocabulary from the name of the checkpoint.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer(train_sequences[0])

train_tokenized = tokenizer(train_sequences)
test_tokenized = tokenizer(test_sequences)

from datasets import Dataset

train_ds = Dataset.from_dict(train_tokenized)
test_ds = Dataset.from_dict(test_tokenized)
train_ds

train_ds = train_ds.add_column("labels", train_labels)
test_ds = test_ds.add_column("labels", test_labels)
train_ds

import gc
import torch

gc.collect()
torch.cuda.empty_cache()

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# num_labels=1 puts a single-output regression head on top of the ESM-2 encoder
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=1)

from evaluate import load
import numpy as np

metric_spearmanr = load("spearmanr")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric_spearmanr.compute(predictions=predictions, references=labels)

torch.cuda.is_available()  # check whether a GPU is available; use a GPU runtime

# Note: `!export ...` in a notebook runs in a subshell and does not affect the Python process,
# so set the allocator option through os.environ instead
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

model_name = model_checkpoint.split("/")[-1]
lr, bs = 2e-5, 16  # would prefer bs=32 or 64, but not possible in Colab
wd, epochs = 0.01, 20

args = TrainingArguments(
    f"{model_name}-finetuned-rep7868aav2-v1",
    learning_rate=lr,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=2*bs,
    num_train_epochs=epochs,
    weight_decay=wd,
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train();

history = trainer.state.log_history
pd.DataFrame(history).head()

# code from: https://github.com/agemagician/ProtTrans/blob/master/Fine-Tuning/PT5_LorA_Finetuning_per_prot.ipynb
# Get loss, val_loss, and the computed metric from history
loss = [x['loss'] for x in history if 'loss' in x]
val_loss = [x['eval_loss'] for x in history if 'eval_loss' in x]

# Get spearmanr (for regression) or accuracy value (for classification)
if [x['eval_spearmanr'] for x in history if 'eval_spearmanr' in x] != []:
    metric = [x['eval_spearmanr'] for x in history if 'eval_spearmanr' in x]
else:
    metric = [x['eval_accuracy'] for x in history if 'eval_accuracy' in x]

epochs = [x['epoch'] for x in history if 'loss' in x]

# Create a figure with two y-axes
fig, ax1 = plt.subplots(figsize=(10, 5))
ax2 = ax1.twinx()

# Plot loss and val_loss on the first y-axis
line1 = ax1.plot(epochs, loss, label='train_loss')
line2 = ax1.plot(range(1, 21), val_loss, label='val_loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')

# Plot the computed metric on the second y-axis
line3 = ax2.plot(range(1, 21), metric, color='red', label='val_metric')
ax2.set_ylabel('Metric')
ax2.set_ylim([0, 1])

# Combine the lines from both y-axes and create a single legend
lines = line1 + line2 + line3
labels = [line.get_label() for line in lines]
ax1.legend(lines, labels, loc='lower left')

# Show the plot
plt.title("Training History")
plt.show()

trainer.save_model("models/esm2_t12_35M_UR50D-finetuned-rep7868aav2-v1")
#model.save_pretrained("path/to/model")

# save the model to the Hugging Face Hub
trainer.push_to_hub(commit_message="Training completed!")
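# Optional check (not in the original notebook): because load_best_model_at_end=True,
# `trainer.model` now holds the best checkpoint seen during training. trainer.evaluate() reruns
# evaluation on test_ds and returns a dict that includes eval_loss and eval_spearmanr, which
# should closely match the numbers computed manually with a DataLoader below.
best_metrics = trainer.evaluate()
print(best_metrics["eval_loss"], best_metrics["eval_spearmanr"])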
# Put both models on the same device
model = model.to("cpu")
model_reload = AutoModelForSequenceClassification.from_pretrained(
    "models/esm2_t12_35M_UR50D-finetuned-rep7868aav2-v1").to("cpu")

# Iterate through the parameters of the two models and compare the data
for param1, param2 in zip(model.parameters(), model_reload.parameters()):
    if not torch.equal(param1.data, param2.data):
        print("Models have different weights")
        break
else:
    print("Models have identical weights")

test_ds

from torch.utils.data import DataLoader
from tqdm import tqdm

# Set the device to use
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# make compatible with torch DataLoader
test_set = test_ds.with_format("torch", device=device)

# Create a dataloader for the test dataset
test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False)

# Put the model in evaluation mode
model.eval()

# Make predictions on the test dataset
predictions = []
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # add batch results (logits) to predictions
        predictions += model(input_ids, attention_mask=attention_mask).logits.tolist()

# Regression
from scipy import stats

spearmanr = stats.spearmanr(a=predictions, b=test_ds['labels'], axis=0)
print(spearmanr)

from sklearn.metrics import PredictionErrorDisplay

fig, (ax0, ax1) = plt.subplots(figsize=(12, 6), nrows=1, ncols=2)

# plot actual vs predicted values
PredictionErrorDisplay.from_predictions(
    test_ds['labels'],
    predictions,
    ax=ax0,
    kind='actual_vs_predicted',
    scatter_kwargs={"alpha": 0.5}
)
ax0.plot([], [], " ", label=f"Spearman r: {np.round(spearmanr.statistic, 4)}")
ax0.legend(loc="best")
ax0.axis('tight')

# plot residuals vs predicted values
PredictionErrorDisplay.from_predictions(
    test_ds['labels'],
    np.ravel(predictions),
    kind='residual_vs_predicted',
    ax=ax1,
    scatter_kwargs={"alpha": 0.5}
)
ax1.plot([], [], " ", label=f"Spearman r: {np.round(spearmanr.statistic, 4)}")
ax1.legend(loc="best")
ax1.axis('tight')

plt.tight_layout()
plt.show();

# predicted fitness for first entry in test_ds
predictions[0][0]

# Actual fitness for first entry in test_ds
test_ds['labels'][0]

# check to ensure test sequences are in correct order in test_ds
print(tokenizer(test_sequences[0])['input_ids'] == test_ds['input_ids'][0])
print(tokenizer(test_sequences[100])['input_ids'] == test_ds['input_ids'][100])

# Download final model from colab
!zip -r models/esm2_t12_35M_UR50D-finetuned-rep7868aav2-v1.zip models/esm2_t12_35M_UR50D-finetuned-rep7868aav2-v1/

# Download that zip file
from google.colab import files
files.download("models/esm2_t12_35M_UR50D-finetuned-rep7868aav2-v1.zip")
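# Hedged sketch (not in the original notebook): score a single variant with the fine-tuned
# regression head. It reuses test_sequences[0] so the output can be compared against
# predictions[0][0] above; swap in any candidate Rep78 variant sequence of interest.
import torch

def predict_fitness(seq, model=model, tokenizer=tokenizer, device=device):
    """Return the model's predicted (normalized) fitness for one protein sequence."""
    inputs = tokenizer(seq, return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    return logits.item()

print(predict_fitness(test_sequences[0]))  # should be close to predictions[0][0]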