This code performs four main tasks:
[First part]
data/APOGEE2_2022
folder.mitimpact_features.csv
stored in a newly created extracted_data
folder.[Seconda part]
Note that several hours are required to execute the entire notebook and that some of the hyperparameters have been dropped from the grid-search procedure in order to speed up the computation. The notebook will create a test/
folder to store models and predictions (if test/
is not empty its content will be overwritten). After completing the cross-validation procedure, the notebook will calculate the mean auROC and its confidence intervals.
data/
MitImpact_db_3.0.6.1.txt
: MitImpact file; it contains all possible nonsynonymous mitochondrial SNVs
mtmam.csv
: MtMam substitution matrix for mitochondrial amino acid changes
ddg_dict.pk
: pickle file encoding a Python dictionary for the precomputed ΔΔG values (ΔG_mutant - ΔG_wildtype) resulting from the amino acid changes
AA3Dposition_aligned.csv
: 3D coordinates of the amino acids in the mitochondrial proteins
dataset_21-04-21.csv
: labels in the reference dataset (P
and N
indicate respectively that a variant is pathogenic or neutral); only SNVs are reported in the file
extracted_data/
mitimpact_features.csv
: features extracted from MitImpact and used to annotateall SNVs
n.b., extracted_data/
is created in the section 'Create of a list containing the names of features to extract from the mitimpact
dataframe' below
import pandas as pd
import numpy as np
import pickle as pk
import os, warnings
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.base import clone
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, GridSearchCV, ParameterGrid
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import SelectFromModel
from imblearn.pipeline import Pipeline
if os.path.exists("./data/APOGEE2_2022/"):
PATH= "."
else: # not os.path.exists("./playgrounds/data/APOGEE2_2022/"):
!git clone https://github.com/mazzalab/playgrounds.git
PATH="./playgrounds"
mitimpact = pd.read_csv(PATH+"/data/APOGEE2_2022/MitImpact_db_3.0.6.1.txt", sep="\t", low_memory=False, index_col="MitImpact_id")
# Rename the column Start to Pos
mitimpact.rename(columns={"Start": "Pos"}, inplace=True)
# Replace "." with NaN values
mitimpact.replace(".", np.nan, inplace=True)
mitimpact.head()
mtmam = pd.read_csv(PATH+"/data/APOGEE2_2022/mtmam.csv", sep="\t", index_col=0)
mtmam.head()
# Annotate the mitimpact variants with mtMam scores
mitimpact["mtmam"] = mtmam.stack().loc[mitimpact.set_index(["AA_ref", "AA_alt"]).index].to_numpy()
mitimpact[["mtmam"]].head()
ddg = pd.DataFrame.from_dict(pk.load(open(PATH+"/data/APOGEE2_2022/ddg_dict.pk", "rb")), orient="index").stack().to_frame()
# Parse the dataframe
ddg = pd.DataFrame(ddg[0].values.tolist(), index=ddg.index).stack().to_frame()
# Rename the column of the scores as "ddg"
ddg.columns = ["ddg"]
# Reset index
ddg.reset_index(inplace=True)
# Parse the information about the position of AA in the protein
ddg["AA_pos"] = ddg.level_1.str.split(r"[A-Z]", expand=True)[1].astype(float)
# Parse the information about the reference AA
ddg["level_1"] = ddg.level_1.str.split("", expand=True)[1]
# Rename the columns
ddg.rename(columns={"level_0":"Gene_symbol","level_2":"AA_alt","level_1":"AA_ref"}, inplace=True)
# Annotate variants in MitImpact with the ddg values (from the `ddg` dataframe)
mitimpact["ddg"] = pd.merge(
mitimpact[["Gene_symbol","AA_pos","AA_ref","AA_alt"]],
ddg,
on=["Gene_symbol","AA_pos","AA_ref","AA_alt"],
how="left",
).ddg.to_numpy()
mitimpact[["ddg"]].head()
coords3D = pd.read_csv(PATH+"/data/APOGEE2_2022/AA3Dposition_aligned.csv", sep="\t", header=None)
# Rename the columns appropriately
coords3D.columns=["Gene_symbol", "AA_pos", "X", "Y", "Z"]
# Annotate the variants in MitImpact with X, Y, and Z values corresponding to the 3D coordinates of the amino acids.
mitimpact[["X", "Y", "Z"]] = pd.merge(
mitimpact[["Gene_symbol","AA_pos"]],
coords3D,
on=["Gene_symbol","AA_pos"],
how="left",
)[["X", "Y", "Z"]].to_numpy()
mitimpact[["X", "Y", "Z"]].head()
mitimpact
dataframe¶features = [
"PhyloP_100V", "PhastCons_100V", "PolyPhen2_score", "SIFT_score", "FatHmmW_score",
"FatHmm_score", "PROVEAN_score", "MutationAssessor_score", "EFIN_SP_score", "EFIN_HD_score",
"CADD_phred_score", "VEST_pvalue", "PANTHER_score", "PhD-SNP_score", "SNAP_score", "MutationTaster_score",
"mtmam", "ddg", "Pos", "X", "Y", "Z"
]
# Cast the variables into float values.
mitimpact[features] = mitimpact[features].astype(float)
# Save the dataframe obtained.
try:
os.mkdir(PATH+"/data/APOGEE2_2022/extracted_data/")
except FileExistsError:
print("The folder exists. Data will be overwritten")
mitimpact[features].to_csv(PATH+"/data/APOGEE2_2022/extracted_data/mitimpact_features.csv", sep="\t")
mitimpact[features].head()
import shutil
def _empty_or_create(folder_path):
if(os.path.exists(folder_path)):
warnings.warn(folder_path+" is not empty. It will be emptied")
shutil.rmtree(folder_path)
os.mkdir(folder_path)
# Existence file check
if not 'dataset_21-04-21.csv' in os.listdir(PATH+"/data/APOGEE2_2022/"):
raise FileNotFoundError(f"Cannot find 'dataset_21-04-21.csv' in folder {PATH}/data/APOGEE2_2022/")
elif not 'mitimpact_features.csv' in os.listdir(PATH+"/data/APOGEE2_2022/extracted_data"):
raise FileNotFoundError(f"Cannot find 'mitimpact_features.csv' in folder {PATH}/data/APOGEE2_2022/. Please ensure you have executed the first part of the code above")
else:
print("Required files present")
Read the classes of training-set variants:
0
means Benign variants;1
means Pathogenic variants.The index values are the MitImpact_ID
of the variants.
Y = pd.read_csv(PATH+"/data/APOGEE2_2022/dataset_21-04-21.csv", sep="\t", index_col="MitImpact_ID", comment="#").Class
Y.replace(["N", "P"], [0, 1], inplace=True)
# Target Vector
Y.head()
Annotate the training-set variants (Y
vector) with the features extracted from the mitimpact_features.csv
file (previously generated in the First part)
X = pd.read_csv(PATH+"/data/APOGEE2_2022/extracted_data/mitimpact_features.csv", sep="\t", index_col="MitImpact_id").loc[Y.index]
# Feature Matrix
X.head()
#@title ##Settings
#@markdown <h2>**SETTINGS**</h2>
#@markdown
#@markdown choose the machine learning classifier
method = 'KNN_RusSmote' #@param ["BalancedRF", "GNB_BalancedBagging", "KNN_BalancedBagging", "KNN_RusSmote", "rbfSVC", "RusSmoteForest"]
#@markdown
#@markdown select the number of splits for the test cross-validation (outer cv splits)
n_splits = 10 #@param {type:"slider", min:5, max:20, step:1}
#@markdown
#@markdown select the number of different partitions for the test cross-validation (outer cv repetitions)
n_repeats = 2 #@param {type:"slider", min:1, max:10, step:1}
#@markdown <h7>**Attention:**</h7>
#@markdown <h7><i>For computational limit, please ensure that `n_splits * n_repeats` is not greater than 60</i></h7>
#@markdown
#@markdown select the number of splits for the grid-search cross-validation (inner cv splits)
gridsearc_cv_splits = 5 #@param {type:"slider", min:5, max:19, step:1}
#@markdown
#@markdown type an integer number as random state (used for the partitions in both inner and outer cv)
random_state = 118 #@param {type:"integer"}
kf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
#@markdown
#@markdown check if you want to have real-time feedbacks during the training, validation and test (recommended for debugging)
verbose = True #@param {type:"boolean"}
# Define the three sequential preprocessing steps: standardize the features, impute the missing values, remove the low importance features.
preprocessing_pipeline = Pipeline([
("scaler", StandardScaler()),
("imputer", IterativeImputer(RandomForestRegressor(n_jobs=-1), tol=0.05, verbose=verbose)),
("feature_selection", SelectFromModel(DecisionTreeClassifier(criterion="entropy", class_weight="balanced"), threshold=0.01)),
])
# Initialize the classifier according to the settings
# In addition hyper-parameters grid is initialized according to the `method` chosen in the settings
def init_model_and_params():
global clf, grid
if method=="rbfSVC":
from sklearn.svm import SVC
clf = SVC(class_weight="balanced", kernel='rbf', probability=True)
grid = ParameterGrid({
"C": [[100], [10], [1], [0.1]],
"gamma": [[0.1], [0.01], [0.001], [0.0001]],
})
return
if method=="KNN_RusSmote":
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
clf = BaggingClassifier(
base_estimator=Pipeline([
("rus", RandomUnderSampler()),
("smote", SMOTE(n_jobs=-1)),
("clf", KNeighborsClassifier(n_jobs=-1))
]),
n_estimators=200,
n_jobs=-1,
verbose=0
)
grid = ParameterGrid({
"base_estimator__clf__n_neighbors": [[3], [5], [7], [9]],
"base_estimator__rus__sampling_strategy": [[0.25], [0.5]],
"max_features": [[0.25], [0.5]],
"base_estimator__clf__weights": [["uniform"], ["distance"]],
})
return
if method=="GNB_BalancedBagging":
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.naive_bayes import GaussianNB
clf = BalancedBaggingClassifier(GaussianNB(), n_estimators=200, n_jobs=-1)
grid = ParameterGrid({
"base_estimator__var_smoothing": [[1e-07], [1e-08], [1e-09], [1e-10], [1e-11]],
"max_features": [[0.25], [0.5], [0.75]],
"max_samples": [[0.25], [0.5], [0.75]],
})
return
if method=="BalancedRF":
from imblearn.ensemble import BalancedRandomForestClassifier
clf = BalancedRandomForestClassifier(n_estimators=200, criterion="entropy", n_jobs=-1)
grid = ParameterGrid({
"max_depth": [[7], [9], [11], [13]],
"max_features": [[0.25], [0.5], [0.75]],
"min_samples_leaf": [[1], [2], [3]],
"min_samples_split": [[2], [4]],
})
return
if method=="RusSmoteForest":
from sklearn.ensemble import BaggingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
clf = BaggingClassifier(
base_estimator=Pipeline([
("rus", RandomUnderSampler()),
("smote", SMOTE(n_jobs=-1)),
("clf", DecisionTreeClassifier(criterion="entropy"))
]),
n_estimators=200,
n_jobs=-1,
verbose=0
)
grid = ParameterGrid({
"base_estimator__clf__max_depth": [[11], [15], [19]],
"base_estimator__clf__min_samples_leaf": [[1], [2], [3]],
"base_estimator__clf__min_samples_split": [[2], [4]],
"base_estimator__rus__sampling_strategy": [[0.25], [0.5], [0.75]],
"base_estimator__clf__max_features": [[0.25], [0.5], [0.75]],
})
return
if method=="KNN_BalancedBagging":
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
clf = BalancedBaggingClassifier(KNeighborsClassifier(), n_estimators=200, n_jobs=-1)
grid = ParameterGrid({
"base_estimator__n_neighbors": [[3], [5], [7], [9]],
"max_features": [[0.25], [0.5], [0.75]],
"base_estimator__weights": [["distance"]],
"max_samples": [[0.25], [0.5], [0.75]],
})
return
raise ValueError("%s is not a valid method"%method)
init_model_and_params()
# Define the grid-search object according to the classifier and hyper-paramethers grid
gridsearch_clf = GridSearchCV(
estimator=clf,
param_grid=grid,
scoring="roc_auc",
cv=StratifiedKFold(gridsearc_cv_splits, shuffle=True, random_state=random_state),
n_jobs=-1,
error_score="raise",
verbose=verbose,
)
if n_repeats*n_splits > 60:
raise ValueError("`n_repeats*n_splits` expected to be <=60, but %s * %s == %s.\nI now rise this `ValueError` in order to avoid a future `IOError`."%(n_repeats, n_splits, n_repeats*n_splits))
#@markdown Folder where data from test will be stored.
TEST_PATH = PATH + "/data/APOGEE2_2022/test" #@param {type:"string"}
For each cross-validation iteration:
_empty_or_create(TEST_PATH)
_empty_or_create(TEST_PATH+"/preprocessing/")
for i, (train, test) in enumerate(kf.split(X, Y)):
preprocessing_pipeline.fit(X.iloc[train], Y[train])
pk.dump(preprocessing_pipeline, open(TEST_PATH+"/preprocessing/fold_%i.pk"%i, "wb"))
The following code:
GridSearchCV
object) results in the test/classifier
foldertest/predictions
folderdef test_on_fold(i, train, test):
fitted_preprocessing = pk.load(open(TEST_PATH+"/preprocessing/fold_%i.pk"%i, "rb"))
X_train = fitted_preprocessing.transform(X.iloc[train])
X_test = fitted_preprocessing.transform(X.iloc[test])
tuned_classifier = clone(gridsearch_clf).fit(X_train, Y[train])
pk.dump(tuned_classifier, open(TEST_PATH+"/classifier/fold_%i.pk"%i, "wb"))
pk.dump(tuned_classifier.predict_proba(X_test)[:,1], open(TEST_PATH+"/predictions/fold_%i.pk"%(i), "wb"))
_empty_or_create(TEST_PATH+"/classifier/")
_empty_or_create(TEST_PATH+"/predictions/")
# For each cross-validation iteration train the classifier as explained in the `test_on_fold` function
for i, (train, test) in enumerate(kf.split(X, Y)):
print(f"############################## TEST {i} ################################")
test_on_fold(i, train, test)
Y_pred = [pk.load(open(TEST_PATH+"/predictions/fold_%i.pk"%i, "rb")) for i in range(kf.get_n_splits())]
Y_true = [Y[test] for _, test in kf.split(X, Y)]
len(Y_pred), len(Y_true)
AUCs = np.array([roc_auc_score(t, y) for t, y in zip(Y_true, Y_pred)])
plt.boxplot(AUCs)
plt.ylim(.5,1)
plt.xticks([])
plt.ylabel("test AUC")
plt.show()
#@markdown Significance level:
alpha=0.95 #@param {type:"slider", min:0.5, max:0.995, step:0.005}
AUC_mean_distribution = stats.norm(loc=AUCs.mean(), scale=AUCs.std(ddof=1)/np.sqrt(len(Y_pred)))
CI = AUC_mean_distribution.interval(alpha)
_bins = np.linspace(.5,1, 1001)
plt.plot(_bins, AUC_mean_distribution.pdf(_bins))
plt.vlines(
AUCs.mean(), 0, AUC_mean_distribution.pdf(AUCs.mean()),
linestyle="--", label="mean AUC = %.4f"%AUCs.mean()
)
plt.fill_between(
_bins[(_bins>CI[0])&(_bins<CI[1])],
AUC_mean_distribution.pdf(_bins[(_bins>CI[0])&(_bins<CI[1])]),
alpha=.25, label="CI = (%.4f, %.4f)"%(CI[0], CI[1])
)
plt.xlim(AUCs.min(), 1)
plt.xlabel("mean AUC")
plt.ylabel("PDF")
plt.ylim(0,None)
plt.legend()
plt.show()
external_predictors = ["Meta-SNP_score","CAROL_score","Condel_score","COVEC_WMV_score","MtoolBox_DS","APOGEE_score"]
thresholds = np.array([.5, .98, .5, 0, .43, .5])
len(external_predictors), len(thresholds)
extarnal_tools_predictions = mitimpact.loc[Y.index, external_predictors].replace(".", np.nan).astype(float)
extarnal_tools_predictions["Condel_score"] = -extarnal_tools_predictions["Condel_score"]
extarnal_tools_predictions.head()
with plt.style.context('bmh'):
plt.figure(figsize=(10,10))
fpr_bins = np.linspace(0,1,51)
for predictor in external_predictors:
fpr, tpr, _ = roc_curve(Y[~extarnal_tools_predictions[predictor].isna()], extarnal_tools_predictions[predictor].dropna())
plt.plot(
[0]+list(fpr), [0]+list(tpr),
linewidth=2, alpha=.75, linestyle="--",
label="%s (mean AUC=%.4f)"%(predictor, roc_auc_score(Y[~extarnal_tools_predictions[predictor].isna()], extarnal_tools_predictions[predictor].dropna()))
)
pr_bins = np.linspace(0,1,51)
tprs = []
for y_true, y_pred in zip(Y_true, Y_pred):
fpr, tpr, _ = roc_curve(y_true, y_pred)
tprs.append(np.interp(fpr_bins, fpr, tpr))
tprs = np.array(tprs)
plt.plot(
[0]+list(fpr_bins), [0]+list(tprs.mean(0)),
linewidth=3, alpha=.75, color="k",
label="APOGEE2_score (mean AUC=%.4f)"%(AUCs.mean()))
plt.plot([0,1], [0,1], "k:", alpha=.5, label="chance")
plt.xticks(np.linspace(0,1,11))
plt.yticks(np.linspace(0,1,11))
plt.legend(loc="lower right")
plt.ylim(-.025,1.025)
plt.xlim(-.025,1.025)
plt.xlabel("false positive rate")
plt.ylabel("true positive rate")
plt.legend()
plt.show()
with plt.style.context('bmh'):
plt.figure(figsize=(10,10))
recall_bins = np.linspace(0,1,51)
for predictor in external_predictors:
precision,recall, thresholds = precision_recall_curve(Y[~extarnal_tools_predictions[predictor].isna()], extarnal_tools_predictions[predictor].dropna())
avarage_PS=average_precision_score(Y[~extarnal_tools_predictions[predictor].isna()], extarnal_tools_predictions[predictor].dropna())
plt.plot(
[0]+list(recall), [1]+list(precision),
linewidth=2, alpha=.75, linestyle="--",
label="%s (average precision=%.4f)"%(predictor,avarage_PS)
)
##########
precision_apo2, recall_apo2, _ = precision_recall_curve(np.concatenate(Y_true), np.concatenate(Y_pred))
avarage_PS_apo2 = average_precision_score(np.concatenate(Y_true), np.concatenate(Y_pred))
plt.plot(
list(recall_apo2), list(precision_apo2),
linewidth=3, alpha=.75, color="k",
label="APOGEE2_score (average precision=%.4f)"%(avarage_PS_apo2))
plt.xticks(np.linspace(0,1,11))
plt.yticks(np.linspace(0,1,11))
plt.legend(loc="lower right")
plt.ylim(-.025,1.025)
plt.xlim(-.025,1.025)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.show()
%load_ext watermark
%watermark -v -m -p numpy,pandas,matplotlib,sklearn,scipy
# date
print(" ")
%watermark -u -n -t -z