# Install for running in colab
!pip install biopython fair-esm -U kaleido &> /dev/null

query_url =  "https://rest.uniprot.org/uniprotkb/A0A7L4WH77.fasta"

import requests
from Bio import SeqIO
from io import StringIO

def read_fasta_from_url(url):
    # Download the FASTA file content from the web link
    response = requests.get(url)

    if response.status_code == 200:
        # Parse the content using Biopython's SeqIO
        fasta_content = StringIO(response.text)
        record = next(SeqIO.parse(fasta_content, 'fasta'))
        sequence = str(record.seq)

        return sequence
    else:
        print(f"Failed to retrieve data. HTTP Status Code: {response.status_code}")

protein_sequence = read_fasta_from_url(query_url)
print(f"Protein Sequence: {protein_sequence}")
print(f"Protein Sequence Length: {len(protein_sequence)}")

# Save FASTA file locally for future reference
# The -O option in wget allows you to specify the name of the output file.
!wget $query_url -O data/Encapsidation_protein_22K.fasta

# Number of entries in the MSA file
!grep -c "^>" data/Encapsidation_protein_22K.a3m

!head -n 9 data/Encapsidation_protein_22K.a3m

def generate_mutant_triplet(protein_sequence):

    '''
    Input:
    protein_sequence: the input protein sequence for which you want to generate mutant triplets.

    Output:
    This script will generate all possible mutant triplets for the input protein sequence.
    '''

    # Define the amino acid alphabet (one-letter codes)
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"

    # Create a list to store the mutant sequences
    mutants = []

    # Iterate through each position in the protein sequence
    for position in range(len(protein_sequence)):

        # Generate mutants for each amino acid in the alphabet
        for aa in amino_acids:
            # Create a mutant sequence by replacing the original amino acid
            mutant = protein_sequence[position] + str(position+1) + aa
            mutants.append(mutant)

    # Print the list of mutant sequences
    return mutants

mutant = generate_mutant_triplet(protein_sequence)

import pandas as pd

Encapsidation_protein_22K_mutants_df = pd.DataFrame({
    "mutant": mutant
})

Encapsidation_protein_22K_mutants_df['wildtype'] = Encapsidation_protein_22K_mutants_df['mutant'].str[:-1]
Encapsidation_protein_22K_mutants_df['mutation'] = Encapsidation_protein_22K_mutants_df['mutant'].str[-1]

Encapsidation_protein_22K_mutants_df.to_csv('data/Encapsidation_protein_22K_mutants_unlabeled.csv', index=False)

# We meed the MSA file in .a3m format and csv file with mutant triplet to run the script below
!ls data/

# Script to generate mutational landscape
# predict_esm.py script from: https://github.com/facebookresearch/esm/blob/main/examples/variant-prediction/predict.py

!python scripts/predict_esm.py \
    --model-location esm_msa1b_t12_100M_UR50S \
    --sequence $protein_sequence \
    --dms-input ./data/Encapsidation_protein_22K_mutants_unlabeled.csv \
    --mutation-col mutant \
    --dms-output ./data/Encapsidation_protein_22K_mutants_labeled.csv \
    --offset-idx 1 \
    --scoring-strategy masked-marginals \
    --msa-path ./data/Encapsidation_protein_22K.a3m

import pandas as pd

# Read the csv file containing mutation score from MSA Transformer model
fitness_prot22k_msa1b = pd.read_csv('data/Encapsidation_protein_22K_mutants_labeled.csv',
                                    index_col=0)
fitness_prot22k_msa1b

# Long to wide format conversion for plotting heatmap
fitness_prot22k_msa1b_wide = fitness_prot22k_msa1b.pivot_table(index='mutation',
                                                               columns='wildtype',
                                                               values='esm_msa1b_t12_100M_UR50S',
                                                               sort=False)

from pathlib import Path
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import Image

def create_heatmap(values, title='Predicted mutation effects', plot_interactive=True):
    # Create a directory to save the images
    images = Path('images')
    images.mkdir(parents=True, exist_ok=True)

    # Put 0 as midpoint for divergent colorscale
    midpoint = (0 - values.min().min()) / (values.max().max() - values.min().min())

    # Define custom colorscale for blue (negative) to red (positive)
    custom_colorscale = [[0, 'midnightblue'],
                         [midpoint, 'white'],
                         [1, 'red']]

    # Create heatmap trace
    heatmap_trace = go.Heatmap(
        x=values.columns,
        y=values.index,
        z=values,
        colorscale=custom_colorscale
    )

    # Create layout with equal aspect ratio
    layout = go.Layout(
        title=title,
        xaxis=dict(title='wildtype'),
        yaxis=dict(title='mutation', autorange='reversed')
    )

    # Create figure
    fig = go.Figure(data=[heatmap_trace], layout=layout)

    if plot_interactive:
        # Show the plot
        fig.show()
    else:
        # Save the plot as a PNG file
        image_path = images/f'{title}.png'
        pio.write_image(fig, image_path)

        # Display the saved PNG file
        display(Image(filename=image_path))

create_heatmap(fitness_prot22k_msa1b_wide,
               title='Predicted mutation effects using esm_msa1b_t12_100M_UR50S',
               plot_interactive=False)

# Script to generate mutational landscape
# predict_esm.py script from: https://github.com/facebookresearch/esm/blob/main/examples/variant-prediction/predict.py

!python scripts/predict_esm.py \
    --model-location esm2_t33_650M_UR50D esm1v_t33_650M_UR90S_1 esm1v_t33_650M_UR90S_2 esm1v_t33_650M_UR90S_3 esm1v_t33_650M_UR90S_4 esm1v_t33_650M_UR90S_5 esm1b_t33_650M_UR50S \
    --sequence $protein_sequence \
    --dms-input ./data/Encapsidation_protein_22K_mutants_labeled.csv \
    --mutation-col mutant \
    --dms-output ./data/Encapsidation_protein_22K_mutants_labeled.csv \
    --offset-idx 1 \
    --scoring-strategy masked-marginals

# Read the csv file containing mutation score from ESM models
fitness_prot22k_esm = pd.read_csv('data/Encapsidation_protein_22K_mutants_labeled.csv',
                                    index_col=0)
fitness_prot22k_esm

# Long to wide format conversion for plotting heatmap
fitness_prot22k_esm2_650M_wide = fitness_prot22k_esm.pivot_table(index='mutation',
                                                                 columns='wildtype',
                                                                 values='esm2_t33_650M_UR50D',
                                                                 sort=False)

fitness_prot22k_esm1v_1_wide = fitness_prot22k_esm.pivot_table(index='mutation',
                                                                 columns='wildtype',
                                                                 values='esm1v_t33_650M_UR90S_1',
                                                                 sort=False)

create_heatmap(fitness_prot22k_esm2_650M_wide,
               title='Predicted mutation effects from esm2_t33_650M_UR50D',
               plot_interactive=False)

create_heatmap(fitness_prot22k_esm1v_1_wide,
               title='Predicted mutation effects from esm1v_t33_650M_UR90S_1',
               plot_interactive=False)