# Install for running in colab !pip install biopython fair-esm -U kaleido &> /dev/null query_url = "https://rest.uniprot.org/uniprotkb/A0A7L4WH77.fasta" import requests from Bio import SeqIO from io import StringIO def read_fasta_from_url(url): # Download the FASTA file content from the web link response = requests.get(url) if response.status_code == 200: # Parse the content using Biopython's SeqIO fasta_content = StringIO(response.text) record = next(SeqIO.parse(fasta_content, 'fasta')) sequence = str(record.seq) return sequence else: print(f"Failed to retrieve data. HTTP Status Code: {response.status_code}") protein_sequence = read_fasta_from_url(query_url) print(f"Protein Sequence: {protein_sequence}") print(f"Protein Sequence Length: {len(protein_sequence)}") # Save FASTA file locally for future reference # The -O option in wget allows you to specify the name of the output file. !wget $query_url -O data/Encapsidation_protein_22K.fasta # Number of entries in the MSA file !grep -c "^>" data/Encapsidation_protein_22K.a3m !head -n 9 data/Encapsidation_protein_22K.a3m def generate_mutant_triplet(protein_sequence): ''' Input: protein_sequence: the input protein sequence for which you want to generate mutant triplets. Output: This script will generate all possible mutant triplets for the input protein sequence. ''' # Define the amino acid alphabet (one-letter codes) amino_acids = "ACDEFGHIKLMNPQRSTVWY" # Create a list to store the mutant sequences mutants = [] # Iterate through each position in the protein sequence for position in range(len(protein_sequence)): # Generate mutants for each amino acid in the alphabet for aa in amino_acids: # Create a mutant sequence by replacing the original amino acid mutant = protein_sequence[position] + str(position+1) + aa mutants.append(mutant) # Print the list of mutant sequences return mutants mutant = generate_mutant_triplet(protein_sequence) import pandas as pd Encapsidation_protein_22K_mutants_df = pd.DataFrame({ "mutant": mutant }) Encapsidation_protein_22K_mutants_df['wildtype'] = Encapsidation_protein_22K_mutants_df['mutant'].str[:-1] Encapsidation_protein_22K_mutants_df['mutation'] = Encapsidation_protein_22K_mutants_df['mutant'].str[-1] Encapsidation_protein_22K_mutants_df.to_csv('data/Encapsidation_protein_22K_mutants_unlabeled.csv', index=False) # We meed the MSA file in .a3m format and csv file with mutant triplet to run the script below !ls data/ # Script to generate mutational landscape # predict_esm.py script from: https://github.com/facebookresearch/esm/blob/main/examples/variant-prediction/predict.py !python scripts/predict_esm.py \ --model-location esm_msa1b_t12_100M_UR50S \ --sequence $protein_sequence \ --dms-input ./data/Encapsidation_protein_22K_mutants_unlabeled.csv \ --mutation-col mutant \ --dms-output ./data/Encapsidation_protein_22K_mutants_labeled.csv \ --offset-idx 1 \ --scoring-strategy masked-marginals \ --msa-path ./data/Encapsidation_protein_22K.a3m import pandas as pd # Read the csv file containing mutation score from MSA Transformer model fitness_prot22k_msa1b = pd.read_csv('data/Encapsidation_protein_22K_mutants_labeled.csv', index_col=0) fitness_prot22k_msa1b # Long to wide format conversion for plotting heatmap fitness_prot22k_msa1b_wide = fitness_prot22k_msa1b.pivot_table(index='mutation', columns='wildtype', values='esm_msa1b_t12_100M_UR50S', sort=False) from pathlib import Path import plotly.graph_objects as go import plotly.io as pio from IPython.display import Image def create_heatmap(values, title='Predicted mutation effects', plot_interactive=True): # Create a directory to save the images images = Path('images') images.mkdir(parents=True, exist_ok=True) # Put 0 as midpoint for divergent colorscale midpoint = (0 - values.min().min()) / (values.max().max() - values.min().min()) # Define custom colorscale for blue (negative) to red (positive) custom_colorscale = [[0, 'midnightblue'], [midpoint, 'white'], [1, 'red']] # Create heatmap trace heatmap_trace = go.Heatmap( x=values.columns, y=values.index, z=values, colorscale=custom_colorscale ) # Create layout with equal aspect ratio layout = go.Layout( title=title, xaxis=dict(title='wildtype'), yaxis=dict(title='mutation', autorange='reversed') ) # Create figure fig = go.Figure(data=[heatmap_trace], layout=layout) if plot_interactive: # Show the plot fig.show() else: # Save the plot as a PNG file image_path = images/f'{title}.png' pio.write_image(fig, image_path) # Display the saved PNG file display(Image(filename=image_path)) create_heatmap(fitness_prot22k_msa1b_wide, title='Predicted mutation effects using esm_msa1b_t12_100M_UR50S', plot_interactive=False) # Script to generate mutational landscape # predict_esm.py script from: https://github.com/facebookresearch/esm/blob/main/examples/variant-prediction/predict.py !python scripts/predict_esm.py \ --model-location esm2_t33_650M_UR50D esm1v_t33_650M_UR90S_1 esm1v_t33_650M_UR90S_2 esm1v_t33_650M_UR90S_3 esm1v_t33_650M_UR90S_4 esm1v_t33_650M_UR90S_5 esm1b_t33_650M_UR50S \ --sequence $protein_sequence \ --dms-input ./data/Encapsidation_protein_22K_mutants_labeled.csv \ --mutation-col mutant \ --dms-output ./data/Encapsidation_protein_22K_mutants_labeled.csv \ --offset-idx 1 \ --scoring-strategy masked-marginals # Read the csv file containing mutation score from ESM models fitness_prot22k_esm = pd.read_csv('data/Encapsidation_protein_22K_mutants_labeled.csv', index_col=0) fitness_prot22k_esm # Long to wide format conversion for plotting heatmap fitness_prot22k_esm2_650M_wide = fitness_prot22k_esm.pivot_table(index='mutation', columns='wildtype', values='esm2_t33_650M_UR50D', sort=False) fitness_prot22k_esm1v_1_wide = fitness_prot22k_esm.pivot_table(index='mutation', columns='wildtype', values='esm1v_t33_650M_UR90S_1', sort=False) create_heatmap(fitness_prot22k_esm2_650M_wide, title='Predicted mutation effects from esm2_t33_650M_UR50D', plot_interactive=False) create_heatmap(fitness_prot22k_esm1v_1_wide, title='Predicted mutation effects from esm1v_t33_650M_UR90S_1', plot_interactive=False)