!pip install datasets &> /dev/null import pickle import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns sns.set_context(context="paper", font_scale=1.5) # load sklearn model we built previously with open('models/bag_emb_esm1v_rep7868aav2.pkl', 'rb') as f: bag_emb_esm1v = pickle.load(f) # Check to ensure md5sum of the model is b2adfa4cf9be0b8614ab2b0c6aeee622 !md5sum models/bag_emb_esm1v_rep7868aav2.pkl # Wild type sequence of Rep78 from AAV2 rep78_seq = 'MPGFYEIVIKVPSDLDGHLPGISDSFVNWVAEKEWELPPDSDMDLNLIEQAPLTVAEKLQRDFLTEWRRVSKAPEALFFVQFEKGESYFHMHVLVETTGVKSMVLGRFLSQIREKLIQRIYRGIEPTLPNWFAVTKTRNGAGGGNKVVDECYIPNYLLPKTQPELQWAWTNMEQYLSACLNLTERKRLVAQHLTHVSQTQEQNKENQNPNSDAPVIRSKTSARYMELVGWLVDKGITSEKQWIQEDQASYISFNAASNSRSQIKAALDNAGKIMSLTKTAPDYLVGQQPVEDISSNRIYKILELNGYDPQYAASVFLGWATKKFGKRNTIWLFGPATTGKTNIAEAIAHTVPFYGCVNWTNENFPFNDCVDKMVIWWEEGKMTAKVVESAKAILGGSKVRVDQKCKSSAQIDPTPVIVTSNTNMCAVIDGNSTTFEHQQPLQDRMFKFELTRRLDHDFGKVTKQEVKDFFRWAKDHVVEVEHEFYVKKGGAKKRPAPSDADISEPKRVRESVAQPSTSDAEASINYADRYQNKCSRHVGMNLMLFPCRQCERMNQNSNICFTHGQKDCLECFPVSESQPVSVVKKAYQKLCYIHHIMGKVPDACTACDLVNVDLDDCIFEQ' len(rep78_seq) # change M225G. This will be considered wild type sequence from here on. Python string is zero-indexed wt_seq = rep78_seq[:224] + 'G' + rep78_seq[225:] len(wt_seq) # ensure length remains same after substitution esm1v_checkpoint = "facebook/esm1v_t33_650M_UR90S_1" # The AutoTokenizer class automatically retrieve the model's configuration, pretrained weights, # or vocabulary from the name of the checkpoint. from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(esm1v_checkpoint) # Similar to the AutoTokenizer class, AutoModel has a from_pretrained() method # to load the weights of a pretrained model from transformers import AutoModel import torch device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = AutoModel.from_pretrained(esm1v_checkpoint).to(device) def get_esm1v_rep(rep78_variant): """ Converts rep78 protein sequence variant to fixed dimensional vector representation using ESM1v model. rep78 sequence variant -> token encodings -> token embeddings -> last hidden state -> mean -> fixed-dimensional vector representation """ # encode string and convert the tokens to Pytorch tensors tokz = tokenizer(rep78_variant, return_tensors="pt") # Place model inputs on the GPU inputs = {k:v.to(device) for k,v in tokz.items() if k in tokenizer.model_input_names} # Extract last hidden states with torch.no_grad(): # disable automatic calculation of the gradient for inference last_hidden_state = model(**inputs).last_hidden_state # Return vector for [CLS] token and place the final hidden state back on the CPU as NumPy array return torch.mean(last_hidden_state, dim=1).cpu().numpy() def scoring_func(sequence: str): """ scoring function that takes in a string protein sequence, and outputs a fitness value. """ # protein sequence to fixed dimensional vector representation using ESM1v model seq_repr = get_esm1v_rep(sequence) # fixed dimensional vector representation to fitness value pred = bag_emb_esm1v.predict(seq_repr) return pred[0] scoring_func(wt_seq) scoring_func(rep78_seq) from scripts.mh_mcmc_sampler import sample_one_chain starter_seq = rep78_seq sampled_sequences = sample_one_chain(rep78_seq, starter_seq, n_steps=10, # n_steps is a hyperparameter scoring_func=scoring_func, is_accepted_kwargs={"temperature":0.01}, trust_radius=3) sampled_seqs_df = pd.DataFrame(sampled_sequences) sampled_seqs_df from functools import partial def find_mutated_aa(best_seq, starter_seq): """Return aminoacid substitution between two protein sequences of the same length""" mutated_aa = [starter_seq[i]+str(i+1)+best_seq[i] for i in range(len(starter_seq)) if starter_seq[i] != best_seq[i]] return mutated_aa mut_in_rep78 = partial(find_mutated_aa, starter_seq=rep78_seq) sampled_seqs_df['mutated_aa'] = sampled_seqs_df['sequences'].apply(mut_in_rep78) sampled_seqs_df # for a given sequence-to-function model, run 3,500 evolutionary trajectories in parallel for 3,000 iterations from scripts.mh_mcmc_sampler import init_traj_seq, sample_one_chain import dask starter_seq = rep78_seq delayed_results = [] for i in range(10): # sample 3500 independent chains #starter_seq = init_traj_seq(rep78_seq, trust_radius=3) starter_seq = rep78_seq sampled_sequences = dask.delayed(sample_one_chain)(rep78_seq, starter_seq, n_steps=3000, # 3000 iterations scoring_func=scoring_func, is_accepted_kwargs={"temperature":0.01}, trust_radius=3) delayed_results.append(sampled_sequences) results = dask.compute(*delayed_results) # Convert everything into a single DataFrame sampled_sequences_df = pd.concat([pd.DataFrame(r) for r in results]) def find_best_seq_traj(results): """Find the best protein sequence in each evolutionary trajectory.""" best_seq_traj = [] for traj, r in enumerate(results): df = pd.DataFrame(r) df['trajectory'] = traj df_accept = df.query('accept == True') best_seq_traj.append(df_accept.iloc[[np.argmax(df_accept['scores'].values)]]) best_seq_traj = pd.concat(best_seq_traj) best_seq_traj['mutated_aa'] = best_seq_traj['sequences'].apply(mut_in_rep78) return best_seq_traj # find best protein sequence in each trajectory traj_01_10 = find_best_seq_traj(results) traj_01_10 traj_01_10.to_csv('data/traj_01_10.csv') sampled_sequences_df.to_csv('data/sampled_sequences.csv') traj1_scores = results[0]['scores'] # Plot fitness as a function of iterations for one evolutionary trajectory plt.figure(figsize=(12, 6), dpi=800) sns.lineplot(traj1_scores) plt.xlabel('iterations'); plt.ylabel('fitness'); plt.title('Evolutionary trajectory 1'); plt.show() traj1_scores = results[1]['scores'] # Plot fitness as a function of iterations for one evolutionary trajectory plt.figure(figsize=(12, 6), dpi=800) sns.lineplot(traj1_scores) plt.xlabel('iterations'); plt.ylabel('fitness'); plt.title('Evolutionary trajectory 2'); plt.show()