#!/usr/bin/env python
# coding: utf-8

# **NB:** Precomputed predictions can be found in the Excel file `./data/precomputed_predictions.xlsx`

# In[1]:


import os, pickle, glob, csv
import prody
import numpy as np
import pandas as pd


# In[2]:


import rhapsody as rd

rd.initialSetup()


# # Analysis of human ROMK variants
# Since the structure of human ROMK has not been resolved yet, we will use a homology model from SWISS-MODEL.

# In[3]:


from prody import *

pdb_path = 'data/ROMK_homology_model.pdb'
pdb = prody.parsePDB(pdb_path).select('protein')

print(pdb['A'].getResnums())
pdb.getHierView()


# ## RHAPSODY predictions
# We will compute predictions with different environment models and with/without EVmutation feature included in the classifier. We will refer to the Rhapsody classifier without EVmutation feature as "classic".

# In[4]:


all_preds = {}

if os.path.isfile('local/all_preds.pkl'):
    all_preds = pickle.load(open('local/all_preds.pkl', 'rb'))
    print('Precomputed predictions imported.')
else:
    if not os.path.isdir('local'):
        os.mkdir('local')
    for mode in ['classic', 'EVmutation']:
        for env in ['chain', 'reduced', 'sliced']:
            case = f'{mode}-{env}'
            folder = os.path.join('local', case)
            if os.path.isdir(folder):
                pickle_file = os.path.join(folder, 'rhapsody-pickle.pkl')
                all_preds[case] = pickle.load(open(pickle_file, 'rb'))
                continue
            os.mkdir(folder)
            # run rhapsody
            if mode == 'classic':
                rh = rd.rhapsody('data/pph2-full.txt', query_type='PolyPhen2', 
                                 custom_PDB=pdb, force_env=env)
            else:
                rh = rd.rhapsody('data/pph2-full.txt', query_type='PolyPhen2', 
                                 main_classifier=rd.getDefaultClassifiers()['EVmut'],
                                 custom_PDB=pdb, force_env=env)
            # store files
            all_preds[case] = rh
            for f in glob.glob('rhapsody-*.*'):
                os.rename(f, os.path.join(folder, f))
    pickle.dump(all_preds, open('local/all_preds.pkl', 'wb'))


# Since ROMK protein is a tetramer, we expect to obtain better predictions when taking into account all monomers when computing dynamical features. This is accomplished by using "sliced" elastic network models.

# In[5]:


rh_ref_classic = all_preds['classic-sliced']
rh_ref_EVmut = all_preds['EVmutation-sliced']


# In[6]:


# print saturation mutagenesis table
rd.print_sat_mutagen_figure(
    'local/sat_mutagen-1', rh_ref_classic, fig_width=26, res_interval=(37, 202))
rd.print_sat_mutagen_figure(
    'local/sat_mutagen-2', rh_ref_classic, fig_width=26, res_interval=(202, 367))

# print residue-averaged predictions on PDB structure
mappedPDBs = rh_ref_classic.writePDBs(filename_prefix='local/rhapsody-PDB')


# In[7]:


from IPython.core.display import Image, display
display(Image('local/sat_mutagen-1.png', width=900, unconfined=True))
display(Image('local/sat_mutagen-2.png', width=900, unconfined=True))


# Rhapsody's output also contains predictions from other two methods, PolyPhen-2 and EVmutation.
# 
# In the table below, we show that Rhapsody and EVmutation have more balanced predictions than PolyPhen-2. The latter, in fact, predicts ~77% of all possible ROMK variants to be "deleterious". On the other hand, Rhapsody and EVmutation are unable to provide predictions for, respectively, ~16% and ~23% of variants.

# In[8]:


from collections import Counter

table = []
for method in ['classic-sliced', 'EVmutation-sliced', 
               'PolyPhen-2', 'EVmutation']:
    if 'sliced' in method:
        preds = all_preds[method].getPredictions()
        count = Counter(preds['path. class'])
    else:
        preds = all_preds['classic-sliced'].getPredictions()
        count = Counter(preds[f'{method} path. class'])
    # calculate fraction of predictions in each class
    dels = count.get('deleterious', 0) + count.get('prob.delet.', 0)
    neus = count.get('neutral', 0) + count.get('prob.neutral', 0)
    unks = count.get('?', 0)
    n = len(preds)
    if method == 'classic-sliced':
        method = 'Rhapsody'
    elif method == 'EVmutation-sliced':
        method = 'Rhapsody+EVmutation'
    table.append([method, dels/n*100, neus/n*100, unks/n*100])
df_stats = pd.DataFrame(table, columns=['method', 'del. SAVs', 'neu. SAVs', 'no pred.'])
pd.set_option('max_colwidth', 20)
pd.options.display.float_format = '{:,.1f}%'.format
df_stats


# ## Comparison with experimental results

# In[9]:


# import exp. results from file
reader = csv.reader(open('data/experimental_results.csv'))
# translation map for phenotype codes 
exp_map = {
    '-1': 'INC', # increased growth
    '0': 'NEU',  # no growth defect
    '1': 'INT',  # modest growth defect
    '2': 'DEL'   # severe growth defect
}

# rename classifier variants and prediction outputs
cases = list(all_preds.keys())
case_map = {
    'classic-chain': 'R_ch', 'classic-reduced': 'R_rd', 
    'classic-sliced': 'R_sl', 'EVmutation-chain': 'R+E_ch',
    'EVmutation-reduced': 'R+E_rd', 'EVmutation-sliced': 'R+E_sl'}

# fill table with exp. results and relative predictions for
# tested variants
table = []
for i,l in enumerate(reader):
    # skip header
    if i==0:
        continue
        
    # store SAV coordinates, exp. info and group info
    SAV = l[0]
    oSAV = 'P48048 '+l[1] if l[1] else 'P48048 '+SAV
    exp_fitness = exp_map[l[2]]
    group = l[3]
    pred_class = l[4]
    hSAV = f'({l[1][-3:]:3})' if l[1] else ' -'
    row = [SAV, oSAV, hSAV, exp_fitness, group, pred_class]
    
    # store predictions (scores and classes)
    for case in cases:
        rh = all_preds[case].getPredictions(SAV=oSAV)
        row.append(rh['path. prob.'])
        row.append(rh['path. class'])
    
    # store predictions from EVmutation and PolyPhen-2
    rh = rh_ref_classic.getPredictions(SAV=oSAV)
    for method in ['EVmutation', 'PolyPhen-2']:
        row.append(rh[f'{method} score'])
        row.append(rh[f'{method} path. class'])

    table.append(row)

# store table in a dataframe
columns = ['SAV', 'oSAV', 'hSAV', 'exp. fitness', 'group', 'pred.class']
for c in cases:
    columns.extend([f'{case_map[c]} score', case_map[c]])
for c in ['EVmut', 'PP2']:
    columns.extend([f'{c} score', c])
df = pd.DataFrame(table, columns=columns)

# print subset of columns
sel_cols = [c for c in columns if not (c=='oSAV' or c.endswith('score'))]
pd.set_option('max_colwidth', 15)
display(df[sel_cols])


# ## AUROC and other accuracy metrics

# In[10]:


class_map = {'neutral': 0, 'prob.neutral': 0, '?': -1,
             'deleterious': 1, 'prob.delet.': 1}
resampling=100

metrics = {}

for INC_label in [-1, 0, 1]:
    
    metrics[INC_label] = {}
    
    # define how to assign binary classification labels, in particular
    # for variants displaying "increased growth" (INC) phenotype
    label_map = {'DEL': 1, 'INT': 1, 'NEU': 0, 'INC': INC_label}
    true_labels = np.array([label_map[x] for x in df['exp. fitness']])

    # compute accuracy metrics
    for method in ['R_sl', 'R+E_sl', 'PP2', 'EVmut']:
        pscore = df[f'{method} score']
        if method == 'EVmut':
            pscore = -pscore
        pclass = np.array([class_map[c] for c in df[method]])
        # select valid predictions and true labels
        sel = np.logical_and(~np.isnan(pscore), true_labels!=-1)
        # compute accuracy metrics
        ds = rd.calcScoreMetrics(true_labels[sel], pscore[sel])
        dc = rd.calcClassMetrics(true_labels[sel], pclass[sel])
        # bootstrapped measures
        dsb = rd.calcScoreMetrics(true_labels[sel], pscore[sel], bootstrap=resampling)
        dcb = rd.calcClassMetrics(true_labels[sel], pclass[sel], bootstrap=resampling)
        metrics[INC_label][method] = {**ds , **dc, **dsb, **dcb}
    
    # for the sake of comparison, we also consider Rhapsody and PolyPhen-2's
    # predictions on the same subset of variants as EVmutation's
    for method in ['R_sl', 'PP2']:
        pscore = df[f'{method} score']
        pclass = np.array([class_map[c] for c in df[method]])
        # ignore SAVs with no EVmutation predictions
        sel = np.logical_and(~np.isnan(df[f'EVmut score']), true_labels!=-1)
        # compute accuracy metrics
        ds = rd.calcScoreMetrics(true_labels[sel], pscore[sel])
        dc = rd.calcClassMetrics(true_labels[sel], pclass[sel])
        # bootstrapped measures
        dsb = rd.calcScoreMetrics(true_labels[sel], pscore[sel], bootstrap=resampling)
        dcb = rd.calcClassMetrics(true_labels[sel], pclass[sel], bootstrap=resampling)
        metrics[INC_label][f'{method}*'] = {**ds , **dc, **dsb, **dcb}


# In[11]:


columns = ['method', 'INC class', 'AUROC', 'AUPRC', 'MCC',
           'precision', 'recall', 'F1 score',
           'precision (0)', 'recall (0)', 'F1 score (0)', 
           'precision (1)', 'recall (1)', 'F1 score (1)']
table = []
for method in ['R_sl', 'PP2', 'EVmut', 'R+E_sl', 'R_sl*', 'PP2*']:
    for INC_label in [-1, 0, 1]:
        row = [method, INC_label]
        d = metrics[INC_label][method]
        for col in columns[2:]:
            s = '{:.2f} ({:.2f}+/-{:.2f})'.format(
                d[col], d[f'mean {col}'], d[f'{col} std']
            )
            row.append(s)
        table.append(row)
df_stats = pd.DataFrame(table, columns=columns)
pd.set_option('max_colwidth', 20)
df_stats


# ## Figures

# In[12]:


import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 16, 'font.family': 'Arial'})

legend = {
    'R_sl':   ('Rhapsody', '-', 'red'),
    'R+E_sl': ('Rhapsody+EVmutation', '--', 'red'),
    'PP2':    ('PolyPhen-2', '-', 'blue'),
    'EVmut':  ('EVmutation', '--', 'green'),
    'R_sl*':  ('Rhapsody*', ':', 'red'),
    'PP2*':   ('PolyPhen-2*', ':', 'blue')
}

fig, axes = plt.subplots(2, 3, figsize=(15,10))

for INC_label in [-1, 0, 1]:
    ax0 = axes[0, INC_label + 1]
    ax1 = axes[1, INC_label + 1]
    
    if INC_label == -1:
        ax0.set_title('no INC SAVs')
    elif INC_label == 0:
        ax0.set_title('INC = NEU')
    else:
        ax0.set_title('INC = DEL')
    ax1.set_title('bootstrap average')
    
    for method in ['R_sl', 'PP2', 'EVmut', 'R+E_sl', 'R_sl*', 'PP2*']:
        l = legend[method]
        metr = metrics[INC_label][method]
        ax0.plot(metr['ROC']['FPR'], metr['ROC']['TPR'],
                 ls=l[1], c=l[2], label=l[0])
        ax1.plot(metr['mean ROC']['FPR'], metr['mean ROC']['TPR'],
                 ls=l[1], c=l[2], label=l[0])

for ax in axes.flatten():
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
axes[0, 0].legend()

fig.tight_layout()
fig.savefig('local/ROCs.png', dpi=300)


# In[13]:


EVmut_cutoff = -rd.getSettings(print=False)['EVmutation_metrics']['optimal cutoff']


# In[14]:


# positive control SAVs
CTRL_SAVs = df[df['group'] == '0']

# SAVs with both Rhapsody and EVmutation predictions
subset = df[~np.isnan(df['EVmut score'])]
subset = subset[subset['group'] != '0']
DEL_SAVs = subset[subset['exp. fitness'] == 'DEL']
INT_SAVs = subset[subset['exp. fitness'] == 'INT']
NEU_SAVs = subset[subset['exp. fitness'] == 'NEU']

# SAVs without EVmutation predictions
subset = df[np.isnan(df['EVmut score'])]
subset = subset[subset['group'] !=0 ]
DEL_SAVs_rh = subset[subset['exp. fitness'] == 'DEL']
INT_SAVs_rh = subset[subset['exp. fitness'] == 'INT']
NEU_SAVs_rh = subset[subset['exp. fitness'] == 'NEU']
INC_SAVs_rh = subset[subset['exp. fitness'] == 'INC']


# In[15]:


EVmut_score = rh_ref_classic.getPredictions()['EVmutation score']
PP2_score = rh_ref_classic.getPredictions()['PolyPhen-2 score']
rhapsody_pprob = rh_ref_classic.getPredictions()['path. prob.']

from scipy.stats import spearmanr

sel = ~np.isnan(EVmut_score)
rho_EVm = spearmanr(rhapsody_pprob[sel], EVmut_score[sel])

sel = ~np.isnan(rhapsody_pprob)
rho_PP2 = spearmanr(rhapsody_pprob[sel], PP2_score[sel])

print(f'Spearman corr. with EVmutation: {rho_EVm[0]:5.3f}')
print(f'Spearman corr. with PolyPhen-2: {rho_PP2[0]:5.3f}')


# In[16]:


fig, axes = plt.subplots(1, 2, figsize=(12,7),
                         gridspec_kw={'width_ratios': [10, 4]})
plt.subplots_adjust(wspace=0.05)

ax1, ax2 = axes
ax1.scatter(EVmut_score, rhapsody_pprob, marker='.')
ax1.axvline(EVmut_cutoff, color='k', linestyle='--', lw=1, zorder=0)
ax1.axhline(0.5, color='k', linestyle='--', lw=1, zorder=0)

ax1.set_ylim((0,1))
ax1.set_xlabel('EVmutation score')
ax1.set_ylabel('Rhapsody path. probability')

ax1.scatter(CTRL_SAVs['EVmut score'], CTRL_SAVs['R_sl score'], s=80, color='k',  marker='^',
            label='positive\ncontrol', linewidths=.5, edgecolors='k')
ax1.scatter(NEU_SAVs['EVmut score'], NEU_SAVs['R_sl score'], s=80, color='green',  marker='o',
            label='no growth\ndefect', linewidths=.5, edgecolors='k')
ax1.scatter(INT_SAVs['EVmut score'], INT_SAVs['R_sl score'], s=100, color='darkorange', marker='P',
            label='moderate growth\ndefect', linewidths=.5, edgecolors='k')
ax1.scatter(DEL_SAVs['EVmut score'], DEL_SAVs['R_sl score'], s=80, color='red',    marker='X',
            label='severe growth\ndefect', linewidths=.5, edgecolors='k')

ax2.set_ylim((0,1))
ax2.set_xlabel('PolyPhen-2')
ax2.set_yticklabels([])

ax2.axhline(0.5, color='k', linestyle='--', lw=1, zorder=0)
ax2.axvline(0.5, color='k', linestyle='--', lw=1, zorder=0)
ax2.scatter(PP2_score, rhapsody_pprob, marker='.')
ax2.scatter(NEU_SAVs_rh['PP2 score'], NEU_SAVs_rh['R_sl score'], s=80,
            color='green', marker='o', label='NEU',
            linewidths=.5, edgecolors='k')
ax2.scatter(INC_SAVs_rh['PP2 score'], INC_SAVs_rh['R_sl score'], s=80,
            color='navy', marker='^', label='increased\ngrowth',
            linewidths=.5, edgecolors='k')
ax2.scatter(INT_SAVs_rh['PP2 score'], INT_SAVs_rh['R_sl score'], s=100,
            color='darkorange', marker='P', label='INT',
            linewidths=.5, edgecolors='k')
ax2.scatter(DEL_SAVs_rh['PP2 score'], DEL_SAVs_rh['R_sl score'], s=80,
            color='red', marker='X', label='DEL',
            linewidths=.5, edgecolors='k')

handles = ax1.get_legend_handles_labels()[0]
labels = ax1.get_legend_handles_labels()[1]
handles.insert(1, ax2.get_legend_handles_labels()[0][1])
labels.insert(1, ax2.get_legend_handles_labels()[1][1])
ax1.set_zorder(1)
ax1.legend(handles, labels, fontsize=13, handletextpad=0.2,
           ncol=2, loc=(0.695, 0.71), framealpha=1.0)

plt.savefig('local/scatter_plot-classic.png', dpi=300)


# In[17]:


EVmut_score = rh_ref_EVmut.getPredictions()['EVmutation score']
rhapsody_pprob = rh_ref_EVmut.getPredictions()['path. prob.']

from scipy.stats import spearmanr

sel = ~np.isnan(rhapsody_pprob)
rho = spearmanr(rhapsody_pprob[sel], EVmut_score[sel])

print(f'Spearman corr. Rhapsody+EVmut. vs EVmutation: {rho[0]:5.3f}')


# In[18]:


import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 16, 'font.family': 'Arial'})

fig, axes = plt.subplots(1, 2, figsize=(12,7),
                         gridspec_kw={'width_ratios': [10, 4]})
plt.subplots_adjust(wspace=0.05)

ax1, ax2 = axes
ax1.scatter(EVmut_score, rhapsody_pprob, marker='.')
ax1.axvline(EVmut_cutoff, color='k', linestyle='--', lw=1, zorder=0)
ax1.axhline(0.5, color='k', linestyle='--', lw=1, zorder=0)

ax1.set_ylim((0,1))
ax1.set_xlabel('EVmutation score')
ax1.set_ylabel('Rhapsody+EVmut. path. probability')

ax1.scatter(CTRL_SAVs['EVmut score'], CTRL_SAVs['R+E_sl score'], s=80, color='k', marker='^',
            label='positive\ncontrol', linewidths=.5, edgecolors='k')
ax1.scatter(NEU_SAVs['EVmut score'], NEU_SAVs['R+E_sl score'], s=80, color='green', marker='o',
            label='no growth\ndefect', linewidths=.5, edgecolors='k')
ax1.scatter(INT_SAVs['EVmut score'], INT_SAVs['R+E_sl score'], s=100, color='darkorange', marker='P',
            label='moderate growth\ndefect', linewidths=.5, edgecolors='k')
ax1.scatter(DEL_SAVs['EVmut score'], DEL_SAVs['R+E_sl score'], s=80, color='red', marker='X',
            label='severe growth\ndefect', linewidths=.5, edgecolors='k')

ax1.legend(fontsize=13, handletextpad=0.2)

ax2.axis('off')

plt.savefig('local/scatter_plot-EVmut.png', dpi=300)


# In[19]:


EVmut_score = rh_ref_classic.getPredictions()['EVmutation score']
EVmut_score = EVmut_score[~np.isnan(EVmut_score)]
rhapsody_pprob = rh_ref_classic.getPredictions()['path. prob.']
rhapsody_pprob = rhapsody_pprob[~np.isnan(rhapsody_pprob)]
PP2_score = rh_ref_classic.getPredictions()['PolyPhen-2 score']

fig, axes = plt.subplots(1, 2, figsize=(16,8))

ax = axes[0]
ax2 = ax.twinx()
ax.hist(rhapsody_pprob, density=True, bins='auto',
        color='red', alpha=.7, label='Rhapsody')
ax2.hist(PP2_score, density=True, bins='auto',
         color='blue', alpha=.7, label='PolyPhen-2')
ax.axvline(x=.5, ls='--', color='k')
ax.set_xlabel('predicted score')
ax.set_ylabel('Rhapsody')
ax2.set_ylabel('PolyPhen-2')

handles = ax.get_legend_handles_labels()[0]
labels = ax.get_legend_handles_labels()[1]
handles.extend(ax2.get_legend_handles_labels()[0])
labels.extend(ax2.get_legend_handles_labels()[1])
ax.legend(handles, labels)

axr = axes[1]
axr.hist(EVmut_score, density=True, bins='auto',
         color='green', alpha=.7, label='EVmutation')
axr.set_ylabel('EVmutation')
axr.set_xlabel('predicted score')
axr.axvline(x=EVmut_cutoff, ls='--', color='k')
axr.legend()

fig.tight_layout()
fig.savefig('local/distributions.png', dpi=300)


# In[20]:


import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 18, 'font.family': 'Arial'})

seq_pos = rh_ref_classic.getUniqueSAVcoords()['position']
seq_pos = seq_pos.reshape((-1, 19))
seq_pos = seq_pos[:,0]

resavg_pprob = rh_ref_classic.getResAvgPredictions()['path. prob.']

for rng in [(37,202), (202, 367)]:
    fig, axes = plt.subplots(
        2, 2, figsize=(20,10), 
        gridspec_kw={'width_ratios': [10, 3], 'height_ratios': [2, 10]})
    plt.subplots_adjust(wspace=0.05)
    plt.subplots_adjust(hspace=0.05)

    ax_pp = axes[0,0]
    axes[0,1].axis('off')
    ax = axes[1,0]
    ax_leg = axes[1,1]
    ax_leg.axis('off')

    ax_pp.axhline(y=.5, color='gray', ls=':', zorder=0)
    ax_pp.plot(seq_pos, resavg_pprob, lw=2, label='Rhapsody\npath. prob.')
    ax_pp.set_xlim(rng)
    ax_pp.set_ylim((0, 1))
    ax_pp.set_xticklabels([])
    handles = ax_pp.get_legend_handles_labels()[0]
    labels = ax_pp.get_legend_handles_labels()[1]
    axes[0,1].legend(handles, labels, loc='upper left')

    for i,f in enumerate(rh_ref_classic.featSet):
        f_arr = rh_ref_classic.featMatrix[:,i]
        # residue-average
        f_mtx = f_arr.reshape((-1, 19))
        f_resavg = np.nanmean(f_mtx, axis=1)
        # normalize
        f_resavg -= np.nanmean(f_resavg)
        f_resavg /= 5*np.nanstd(f_resavg)
        # plot
        ax.plot(seq_pos, f_resavg + 9 - i, label=f.replace('-sliced', ''))

    ax.set_ylim((-1, 10))
    ax.set_xlim(rng)
    ax.set_xlabel('residue number')
    ax.set_yticks([])

    handles = ax.get_legend_handles_labels()[0]
    labels = ax.get_legend_handles_labels()[1]
    ax_leg.legend(handles, labels, title="Random Forest features:",
                  loc='upper left', labelspacing=1.2)

    for i, row in df.iterrows():
        pos = int(row['oSAV'].split()[1])
        phenotype = row['exp. fitness']
        if phenotype == 'INC':
            if row['group'] == 0:
                color = 'black'
            else:
                color = 'navy'
        elif phenotype == 'NEU':
            color = 'green'
        elif phenotype == 'INT':
            color = 'darkorange'
        else:
            assert phenotype == 'DEL'
            color = 'red'
        ax_pp.axvline(x=pos, ls='--', lw=2, color=color)
        ax.axvline(x=pos, ls='--', lw=2, color=color)

    plt.savefig('local/features-{}_{}.png'.format(*rng), dpi=300)