!pip install datasets &> /dev/null

import pickle
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context(context="paper", font_scale=1.5)

# load sklearn model we built previously
with open('models/bag_emb_esm1v_rep7868aav2.pkl', 'rb') as f: bag_emb_esm1v = pickle.load(f)

# Check to ensure md5sum of the model is b2adfa4cf9be0b8614ab2b0c6aeee622
!md5sum models/bag_emb_esm1v_rep7868aav2.pkl

# Wild type sequence of Rep78 from AAV2
rep78_seq = 'MPGFYEIVIKVPSDLDGHLPGISDSFVNWVAEKEWELPPDSDMDLNLIEQAPLTVAEKLQRDFLTEWRRVSKAPEALFFVQFEKGESYFHMHVLVETTGVKSMVLGRFLSQIREKLIQRIYRGIEPTLPNWFAVTKTRNGAGGGNKVVDECYIPNYLLPKTQPELQWAWTNMEQYLSACLNLTERKRLVAQHLTHVSQTQEQNKENQNPNSDAPVIRSKTSARYMELVGWLVDKGITSEKQWIQEDQASYISFNAASNSRSQIKAALDNAGKIMSLTKTAPDYLVGQQPVEDISSNRIYKILELNGYDPQYAASVFLGWATKKFGKRNTIWLFGPATTGKTNIAEAIAHTVPFYGCVNWTNENFPFNDCVDKMVIWWEEGKMTAKVVESAKAILGGSKVRVDQKCKSSAQIDPTPVIVTSNTNMCAVIDGNSTTFEHQQPLQDRMFKFELTRRLDHDFGKVTKQEVKDFFRWAKDHVVEVEHEFYVKKGGAKKRPAPSDADISEPKRVRESVAQPSTSDAEASINYADRYQNKCSRHVGMNLMLFPCRQCERMNQNSNICFTHGQKDCLECFPVSESQPVSVVKKAYQKLCYIHHIMGKVPDACTACDLVNVDLDDCIFEQ'
len(rep78_seq)

# change M225G. This will be considered wild type sequence from here on. Python string is zero-indexed
wt_seq = rep78_seq[:224] + 'G' + rep78_seq[225:]
len(wt_seq) # ensure length remains same after substitution

esm1v_checkpoint = "facebook/esm1v_t33_650M_UR90S_1"
# The AutoTokenizer class automatically retrieve the model's configuration, pretrained weights,
# or vocabulary from the name of the checkpoint.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(esm1v_checkpoint)

# Similar to the AutoTokenizer class, AutoModel has a from_pretrained() method
# to load the weights of a pretrained model
from transformers import AutoModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(esm1v_checkpoint).to(device)

def get_esm1v_rep(rep78_variant):
    """
    Converts rep78 protein sequence variant to fixed dimensional vector
    representation using ESM1v model.

    rep78 sequence variant -> token encodings -> token embeddings ->
    last hidden state -> mean -> fixed-dimensional vector representation
    """
    # encode string and convert the tokens to Pytorch tensors
    tokz = tokenizer(rep78_variant, return_tensors="pt")

    # Place model inputs on the GPU
    inputs = {k:v.to(device) for k,v in tokz.items()
              if k in tokenizer.model_input_names}

    # Extract last hidden states
    with torch.no_grad(): # disable automatic calculation of the gradient for inference
        last_hidden_state = model(**inputs).last_hidden_state

    # Return vector for [CLS] token and place the final hidden state back on the CPU as NumPy array
    return torch.mean(last_hidden_state, dim=1).cpu().numpy()

def scoring_func(sequence: str):
    """
    scoring function that takes in a string protein sequence, and
    outputs a fitness value.
    """
    # protein sequence to fixed dimensional vector representation using ESM1v model
    seq_repr = get_esm1v_rep(sequence)
    # fixed dimensional vector representation to fitness value
    pred = bag_emb_esm1v.predict(seq_repr)
    return pred[0]

scoring_func(wt_seq)

scoring_func(rep78_seq)

from scripts.mh_mcmc_sampler import sample_one_chain

starter_seq = rep78_seq
sampled_sequences = sample_one_chain(rep78_seq,
                                     starter_seq,
                                     n_steps=10, # n_steps is a hyperparameter
                                     scoring_func=scoring_func,
                                     is_accepted_kwargs={"temperature":0.01},
                                     trust_radius=3)
sampled_seqs_df = pd.DataFrame(sampled_sequences)
sampled_seqs_df

from functools import partial
def find_mutated_aa(best_seq, starter_seq):
    """Return aminoacid substitution between two protein sequences of the same length"""
    mutated_aa = [starter_seq[i]+str(i+1)+best_seq[i] for i in range(len(starter_seq))
                  if starter_seq[i] != best_seq[i]]
    return mutated_aa

mut_in_rep78 = partial(find_mutated_aa, starter_seq=rep78_seq)

sampled_seqs_df['mutated_aa'] = sampled_seqs_df['sequences'].apply(mut_in_rep78)
sampled_seqs_df

# for a given sequence-to-function model, run 3,500 evolutionary trajectories in parallel for 3,000 iterations
from scripts.mh_mcmc_sampler import init_traj_seq, sample_one_chain
import dask

starter_seq = rep78_seq

delayed_results = []

for i in range(10): # sample 3500 independent chains
    #starter_seq = init_traj_seq(rep78_seq, trust_radius=3)
    starter_seq = rep78_seq
    sampled_sequences = dask.delayed(sample_one_chain)(rep78_seq,
                                                       starter_seq,
                                                       n_steps=3000, # 3000 iterations
                                                       scoring_func=scoring_func,
                                                       is_accepted_kwargs={"temperature":0.01},
                                                       trust_radius=3)
    delayed_results.append(sampled_sequences)

results = dask.compute(*delayed_results)

# Convert everything into a single DataFrame
sampled_sequences_df = pd.concat([pd.DataFrame(r) for r in results])

def find_best_seq_traj(results):
    """Find the best protein sequence in each evolutionary trajectory."""
    best_seq_traj = []
    for traj, r in enumerate(results):
        df = pd.DataFrame(r)
        df['trajectory'] = traj
        df_accept = df.query('accept == True')
        best_seq_traj.append(df_accept.iloc[[np.argmax(df_accept['scores'].values)]])

    best_seq_traj = pd.concat(best_seq_traj)

    best_seq_traj['mutated_aa'] = best_seq_traj['sequences'].apply(mut_in_rep78)

    return best_seq_traj

# find best protein sequence in each trajectory
traj_01_10 = find_best_seq_traj(results)
traj_01_10

traj_01_10.to_csv('data/traj_01_10.csv')

sampled_sequences_df.to_csv('data/sampled_sequences.csv')

traj1_scores = results[0]['scores']
# Plot fitness as a function of iterations for one evolutionary trajectory
plt.figure(figsize=(12, 6), dpi=800)
sns.lineplot(traj1_scores)
plt.xlabel('iterations');
plt.ylabel('fitness');
plt.title('Evolutionary trajectory 1');
plt.show()

traj1_scores = results[1]['scores']
# Plot fitness as a function of iterations for one evolutionary trajectory
plt.figure(figsize=(12, 6), dpi=800)
sns.lineplot(traj1_scores)
plt.xlabel('iterations');
plt.ylabel('fitness');
plt.title('Evolutionary trajectory 2');
plt.show()