# Install for running in colab
!pip install biopython fair-esm -U kaleido &> /dev/null
In Moderna Therapeutics's study titled "Therapeutic Enzyme Engineering Using a Generative Neural Network", scientists turbocharge ornithine transcarbamylase (OTC), a critical player in the urea cycle combating a rare metabolic ailment. Their secret sauce? Marrying insights from nature's evolution dance with a dash of artificial evolution guided by savvy deep learning models. Meanwhile, in the paper "Efficient evolution of human antibodies from general protein language models", authors play protein origami, deftly evolving human antibodies sans the cheat sheet on target antigen specifics binding specificity, or protein structure. No experimental data for model supervision; both approaches propose beneficial mutations in a zero-shot manner.
Crafting functional protein sequences in the protein engineering arena is a puzzle akin to navigating a vast cosmic expanse. Picture a protein with 200 amino acids; a mere switcheroo of one amino acid spawns a array of 4000 potential variants (200*20). My previous notebooks used machine learning as the Sherpa guiding directed evolution when armed with experimental compass. But what if the experimental compass is missing? Can we still map out a path to promising protein sequences by divining the protein's mutational horoscope?
Nature leaves us clues, imprinting functional wisdom in protein sequences. Enter the unsupervised models, revealing higher functional variant tales from the sequence data troves, as showcased in the article "Language models enable zero-shot prediction of the effects of mutations on protein function". These unsupervised maestros for predicting protein variant effects fall into three troupes: hybrid protein language models (PLMs), PLMs, Alignment based models, and Inverse folding models.
Our playbook? This notebook orchestrates and contrasts these model from each class, aiming to spotlight protein sequence gems that can lighten the load of experimental quests in directed evolution. The playbook uses competitive models from the benchmarks set by ProteinGym](https://proteingym.org/), providing a comprehensive evaluation of these models' capabilities in predicting the mutational landscape of proteins.
Goals
From the hybrid PLMs class, we will be using MSA Transformer to predict mutational landscape for encapsidation protein 22K.
query_url = "https://rest.uniprot.org/uniprotkb/A0A7L4WH77.fasta"
import requests
from Bio import SeqIO
from io import StringIO
def read_fasta_from_url(url):
# Download the FASTA file content from the web link
response = requests.get(url)
if response.status_code == 200:
# Parse the content using Biopython's SeqIO
fasta_content = StringIO(response.text)
record = next(SeqIO.parse(fasta_content, 'fasta'))
sequence = str(record.seq)
return sequence
else:
print(f"Failed to retrieve data. HTTP Status Code: {response.status_code}")
protein_sequence = read_fasta_from_url(query_url)
print(f"Protein Sequence: {protein_sequence}")
print(f"Protein Sequence Length: {len(protein_sequence)}")
Protein Sequence: MAPKKKLQLPPPPTDEEEYWDSQAEEVLDEEEEEDMMEDWESLDEEASEAEEVSDETPSPSVAFPSPAPQKSATGSSMATTSAPQASPALPVRRPNRRWDTTGTRAGKSKQPPPLAQEQQQRQGYRSWRGHKNAIVACLQDCGGNISFARRFLLYHHGVAFPRNILHYYRHLYSPYCTGGSSSNSSGHTEAKATG Protein Sequence Length: 195
# Save FASTA file locally for future reference
# The -O option in wget allows you to specify the name of the output file.
!wget $query_url -O data/Encapsidation_protein_22K.fasta
--2023-12-18 21:53:32-- https://rest.uniprot.org/uniprotkb/A0A7L4WH77.fasta Resolving rest.uniprot.org (rest.uniprot.org)... 193.62.193.81 Connecting to rest.uniprot.org (rest.uniprot.org)|193.62.193.81|:443... connected. HTTP request sent, awaiting response... 200 OK Length: unspecified [text/plain] Saving to: ‘data/Encapsidation_protein_22K.fasta’ data/Encapsidation_ [ <=> ] 309 --.-KB/s in 0s 2023-12-18 21:53:32 (185 MB/s) - ‘data/Encapsidation_protein_22K.fasta’ saved [309]
The sequence alignment for encapsidation protein 22K was generated using MMSeq2 from ColabFold v1.5.3 server.
# Number of entries in the MSA file
!grep -c "^>" data/Encapsidation_protein_22K.a3m
124
!head -n 9 data/Encapsidation_protein_22K.a3m
#195 1 >101 MAPKKKLQLPPPPTDEEEYWDSQAEEVLDEEEEEDMMEDWESLDEEASEAEEVSDETPSPSVAFPSPAPQKSATGSSMATTSAPQASPALPVRRPNRRWDTTGTRAGKSKQPPPLAQEQQQRQGYRSWRGHKNAIVACLQDCGGNISFARRFLLYHHGVAFPRNILHYYRHLYSPYCTGGSSSNSSGHTEAKATG >UniRef100_A0A7L4WJM1 270 0.923 1.137E-78 0 194 195 0 193 194 MAPKKKLQLP-PPPTDEEEYWDSQAEEVLDEEEEDMMEDWESLDEEASEAEEVSDETPSPSVAFPSPAPQKSATGSSMATTSAPQASPALPVRRPNRRWDTTGTRAGKSKQPPPLAQEQQQRQGYRSWRGHKNAIVACLQDCGGNISFARRFLLYHHGVAFPRNILHYYRHLYSPYCTGGSSSNSSGHTEAKATG >UniRef100_A0A3Q9HLG8 259 0.892 7.754E-75 0 194 195 0 193 194 MAPKKKLQLP-PPPTDEEEYWDSQAEEVLDEEEEDMMEDWESLDEEASEAEEVSDGTPSPSVASPSPAPQKSATGPSMATTSAPQAPPALPVRRPNRRWDTTGTRAGKSKQPPPLAQEQQQRQGYRSWRGHKNAIVACLQDCGGNISFARRFLLYHHGVAFPRNILHYYRHLYSPYCTGGSGSNSSGHTEAKAPG >UniRef100_A0A3Q9HL92 258 0.888 1.457E-74 0 194 195 0 196 197 MAPKKKLQLPPPPPTDEEEYWDSQAEEVLDEEEEDTMEDWDSLDEEASEAEEVSDETPSPSVAFPSPAPQKSATGPSMATTSAPQAPPALPVRRPNRRWDTTGTRAGKSKQPPPLAQEQQQRQGYRSWRGHKNAIVACLQDCGGNISFARRFLLYHHGVAFPRNILHYYRHLYSPYYTGgsGSGSNSSGHTEAKATG
def generate_mutant_triplet(protein_sequence):
'''
Input:
protein_sequence: the input protein sequence for which you want to generate mutant triplets.
Output:
This script will generate all possible mutant triplets for the input protein sequence.
'''
# Define the amino acid alphabet (one-letter codes)
amino_acids = "ACDEFGHIKLMNPQRSTVWY"
# Create a list to store the mutant sequences
mutants = []
# Iterate through each position in the protein sequence
for position in range(len(protein_sequence)):
# Generate mutants for each amino acid in the alphabet
for aa in amino_acids:
# Create a mutant sequence by replacing the original amino acid
mutant = protein_sequence[position] + str(position+1) + aa
mutants.append(mutant)
# Print the list of mutant sequences
return mutants
mutant = generate_mutant_triplet(protein_sequence)
import pandas as pd
Encapsidation_protein_22K_mutants_df = pd.DataFrame({
"mutant": mutant
})
Encapsidation_protein_22K_mutants_df['wildtype'] = Encapsidation_protein_22K_mutants_df['mutant'].str[:-1]
Encapsidation_protein_22K_mutants_df['mutation'] = Encapsidation_protein_22K_mutants_df['mutant'].str[-1]
Encapsidation_protein_22K_mutants_df.to_csv('data/Encapsidation_protein_22K_mutants_unlabeled.csv', index=False)
# We meed the MSA file in .a3m format and csv file with mutant triplet to run the script below
!ls data/
Encapsidation_protein_22K.a3m Encapsidation_protein_22K_mutants_unlabeled.csv Encapsidation_protein_22K.fasta
# Script to generate mutational landscape
# predict_esm.py script from: https://github.com/facebookresearch/esm/blob/main/examples/variant-prediction/predict.py
!python scripts/predict_esm.py \
--model-location esm_msa1b_t12_100M_UR50S \
--sequence $protein_sequence \
--dms-input ./data/Encapsidation_protein_22K_mutants_unlabeled.csv \
--mutation-col mutant \
--dms-output ./data/Encapsidation_protein_22K_mutants_labeled.csv \
--offset-idx 1 \
--scoring-strategy masked-marginals \
--msa-path ./data/Encapsidation_protein_22K.a3m
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm_msa1b_t12_100M_UR50S.pt" to /root/.cache/torch/hub/checkpoints/esm_msa1b_t12_100M_UR50S.pt Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm_msa1b_t12_100M_UR50S-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm_msa1b_t12_100M_UR50S-contact-regression.pt Transferred model to GPU 100% 196/196 [01:43<00:00, 1.89it/s]
import pandas as pd
# Read the csv file containing mutation score from MSA Transformer model
fitness_prot22k_msa1b = pd.read_csv('data/Encapsidation_protein_22K_mutants_labeled.csv',
index_col=0)
fitness_prot22k_msa1b
mutant | wildtype | mutation | esm_msa1b_t12_100M_UR50S | |
---|---|---|---|---|
0 | M1A | M1 | A | -16.325127 |
1 | M1C | M1 | C | -16.986792 |
2 | M1D | M1 | D | -18.815874 |
3 | M1E | M1 | E | -17.576103 |
4 | M1F | M1 | F | -13.433237 |
... | ... | ... | ... | ... |
3895 | G195S | G195 | S | -9.930399 |
3896 | G195T | G195 | T | -13.784996 |
3897 | G195V | G195 | V | -12.468152 |
3898 | G195W | G195 | W | -14.053451 |
3899 | G195Y | G195 | Y | -15.699408 |
3900 rows × 4 columns
# Long to wide format conversion for plotting heatmap
fitness_prot22k_msa1b_wide = fitness_prot22k_msa1b.pivot_table(index='mutation',
columns='wildtype',
values='esm_msa1b_t12_100M_UR50S',
sort=False)
from pathlib import Path
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import Image
def create_heatmap(values, title='Predicted mutation effects', plot_interactive=True):
# Create a directory to save the images
images = Path('images')
images.mkdir(parents=True, exist_ok=True)
# Put 0 as midpoint for divergent colorscale
midpoint = (0 - values.min().min()) / (values.max().max() - values.min().min())
# Define custom colorscale for blue (negative) to red (positive)
custom_colorscale = [[0, 'midnightblue'],
[midpoint, 'white'],
[1, 'red']]
# Create heatmap trace
heatmap_trace = go.Heatmap(
x=values.columns,
y=values.index,
z=values,
colorscale=custom_colorscale
)
# Create layout with equal aspect ratio
layout = go.Layout(
title=title,
xaxis=dict(title='wildtype'),
yaxis=dict(title='mutation', autorange='reversed')
)
# Create figure
fig = go.Figure(data=[heatmap_trace], layout=layout)
if plot_interactive:
# Show the plot
fig.show()
else:
# Save the plot as a PNG file
image_path = images/f'{title}.png'
pio.write_image(fig, image_path)
# Display the saved PNG file
display(Image(filename=image_path))
create_heatmap(fitness_prot22k_msa1b_wide,
title='Predicted mutation effects using esm_msa1b_t12_100M_UR50S',
plot_interactive=False)
Heatmap which shows us hotspots for mutations that are beneficial or detrimental to the function of the protein.
Protein Language Models (PLMs) are LLMs trained on large protein databases encompassing all known sequences such as VESPA, ESM2, and ESM1v.
PLMs adeptly learn evolutionary constraints that generalize across protein families.
PLM both bidirectional and autoregressive generative models particularly excel in scenarios involving small families with sparse homologs. However, they generally underperform family-specific models for larger families.
# Script to generate mutational landscape
# predict_esm.py script from: https://github.com/facebookresearch/esm/blob/main/examples/variant-prediction/predict.py
!python scripts/predict_esm.py \
--model-location esm2_t33_650M_UR50D esm1v_t33_650M_UR90S_1 esm1v_t33_650M_UR90S_2 esm1v_t33_650M_UR90S_3 esm1v_t33_650M_UR90S_4 esm1v_t33_650M_UR90S_5 esm1b_t33_650M_UR50S \
--sequence $protein_sequence \
--dms-input ./data/Encapsidation_protein_22K_mutants_labeled.csv \
--mutation-col mutant \
--dms-output ./data/Encapsidation_protein_22K_mutants_labeled.csv \
--offset-idx 1 \
--scoring-strategy masked-marginals
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt Transferred model to GPU 100% 197/197 [00:07<00:00, 27.46it/s] Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1v_t33_650M_UR90S_1.pt" to /root/.cache/torch/hub/checkpoints/esm1v_t33_650M_UR90S_1.pt /usr/local/lib/python3.10/dist-packages/esm/pretrained.py:215: UserWarning: Regression weights not found, predicting contacts will not produce correct results. warnings.warn( Transferred model to GPU 100% 197/197 [00:06<00:00, 31.84it/s] Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1v_t33_650M_UR90S_2.pt" to /root/.cache/torch/hub/checkpoints/esm1v_t33_650M_UR90S_2.pt /usr/local/lib/python3.10/dist-packages/esm/pretrained.py:215: UserWarning: Regression weights not found, predicting contacts will not produce correct results. warnings.warn( Transferred model to GPU 100% 197/197 [00:06<00:00, 32.32it/s] Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1v_t33_650M_UR90S_3.pt" to /root/.cache/torch/hub/checkpoints/esm1v_t33_650M_UR90S_3.pt /usr/local/lib/python3.10/dist-packages/esm/pretrained.py:215: UserWarning: Regression weights not found, predicting contacts will not produce correct results. warnings.warn( Transferred model to GPU 100% 197/197 [00:06<00:00, 32.35it/s] Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1v_t33_650M_UR90S_4.pt" to /root/.cache/torch/hub/checkpoints/esm1v_t33_650M_UR90S_4.pt /usr/local/lib/python3.10/dist-packages/esm/pretrained.py:215: UserWarning: Regression weights not found, predicting contacts will not produce correct results. warnings.warn( Transferred model to GPU 100% 197/197 [00:06<00:00, 32.31it/s] Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1v_t33_650M_UR90S_5.pt" to /root/.cache/torch/hub/checkpoints/esm1v_t33_650M_UR90S_5.pt /usr/local/lib/python3.10/dist-packages/esm/pretrained.py:215: UserWarning: Regression weights not found, predicting contacts will not produce correct results. warnings.warn( Transferred model to GPU 100% 197/197 [00:06<00:00, 32.27it/s] Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1b_t33_650M_UR50S.pt" to /root/.cache/torch/hub/checkpoints/esm1b_t33_650M_UR50S.pt Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm1b_t33_650M_UR50S-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm1b_t33_650M_UR50S-contact-regression.pt Transferred model to GPU 100% 197/197 [00:06<00:00, 32.30it/s]
# Read the csv file containing mutation score from ESM models
fitness_prot22k_esm = pd.read_csv('data/Encapsidation_protein_22K_mutants_labeled.csv',
index_col=0)
fitness_prot22k_esm
Unnamed: 0 | mutant | wildtype | mutation | esm_msa1b_t12_100M_UR50S | esm2_t33_650M_UR50D | esm1v_t33_650M_UR90S_1 | esm1v_t33_650M_UR90S_2 | esm1v_t33_650M_UR90S_3 | esm1v_t33_650M_UR90S_4 | esm1v_t33_650M_UR90S_5 | esm1b_t33_650M_UR50S | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | M1A | M1 | A | -16.325127 | -8.098816 | -8.235819 | -5.609891 | -6.854110 | -6.774120 | -6.800508 | -6.751205 |
1 | 1 | M1C | M1 | C | -16.986792 | -11.144790 | -10.752127 | -7.939667 | -9.681019 | -10.263803 | -10.269108 | -9.223083 |
2 | 2 | M1D | M1 | D | -18.815874 | -10.257328 | -9.317606 | -6.818065 | -8.292774 | -8.394552 | -8.780309 | -8.850616 |
3 | 3 | M1E | M1 | E | -17.576103 | -9.041828 | -9.080865 | -6.556577 | -7.643363 | -7.661278 | -7.244850 | -7.663014 |
4 | 4 | M1F | M1 | F | -13.433237 | -10.600483 | -9.636066 | -7.294350 | -8.199809 | -8.164932 | -8.874701 | -9.020489 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3895 | 3895 | G195S | G195 | S | -9.930399 | 0.551918 | 0.538798 | 0.282230 | 0.335325 | 0.482387 | 0.505919 | 0.841090 |
3896 | 3896 | G195T | G195 | T | -13.784996 | 0.215631 | 0.102809 | 0.036828 | -0.015700 | 0.123055 | 0.254141 | 0.318797 |
3897 | 3897 | G195V | G195 | V | -12.468152 | -0.506383 | -0.631024 | -0.503328 | -0.551369 | -0.467469 | -0.509331 | -0.360502 |
3898 | 3898 | G195W | G195 | W | -14.053451 | -3.002845 | -2.173256 | -1.882592 | -1.959077 | -2.038070 | -2.005057 | -2.339551 |
3899 | 3899 | G195Y | G195 | Y | -15.699408 | -2.778253 | -2.155714 | -1.750376 | -1.710923 | -1.869018 | -1.828054 | -2.049861 |
3900 rows × 12 columns
# Long to wide format conversion for plotting heatmap
fitness_prot22k_esm2_650M_wide = fitness_prot22k_esm.pivot_table(index='mutation',
columns='wildtype',
values='esm2_t33_650M_UR50D',
sort=False)
fitness_prot22k_esm1v_1_wide = fitness_prot22k_esm.pivot_table(index='mutation',
columns='wildtype',
values='esm1v_t33_650M_UR90S_1',
sort=False)
create_heatmap(fitness_prot22k_esm2_650M_wide,
title='Predicted mutation effects from esm2_t33_650M_UR50D',
plot_interactive=False)
create_heatmap(fitness_prot22k_esm1v_1_wide,
title='Predicted mutation effects from esm1v_t33_650M_UR90S_1',
plot_interactive=False)
To do:
esm2_t36_3B_UR50D, and esm2_t48_15B_UR50D models.
To do:
...
"The key conceptual advance is that by learning the rules underlying local evolution, we can construct a global evolutionary “vector field” that we show can (1) predict the root (or potentially multiple roots) of observed evolutionary trajectories, (2) order protein sequences in evolutionary time, and (3) identify the mutational strategies that drive these trajectories."