This notebook attempts to replicate the Machine Learning pipeline of the following paper:
import sys, os
sys.path.append('code/scripts')
import os
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from mrmr.pandas import mrmr_classif
from verify import check_data
Assign the name of the CSV file containing your PPMI data to the data
variable. Additionaly, enter cohort information in the cohort
list:
Specify the directory with cohort details. Ensure the following for each cohort:
cohort.csv: Must have columns - PATNO, EVENT_ID, Description.
demographics.csv: Should include columns - PATNO, Age, Stage, SEX, output, UPDRS_TOT (optional).
data = "radiomics_results.csv"
cohorts = [
"cohorts/max_cohorts/cohort_max_off_2_35_matched",
"cohorts/max_cohorts/cohort_max_off_off_65_75_unmatched",
"cohorts/max_cohorts/cohort_max_both_2_59_matched",
"cohorts/max_cohorts/cohort_max_both_102_92_unmatched",
"cohorts/max_cohorts/cohort_max_on_2_43_matched",
"cohorts/max_cohorts/cohort_max_on_on_66_81_unmatched",
"cohorts/max_cohorts/cohort_max_2_72_matched"
]
# Verify data file
assert os.path.exists(data), f"{data} does not exist."
# Verify data
assert all(check_data(cohort) for cohort in cohorts), "Not all cohorts have valid data files."
def get_df(data, cohort, demographics, useUPDRS=False):
"""
Merge and preprocess data from multiple CSV files to create a consolidated dataframe.
Parameters:
- data (str): The path to the primary data CSV file.
- cohort (str): The path to the 'cohort.csv' file containing cohort information.
- demographics (str): The path to the 'demographics*.csv' file containing demographic information.
- useUPDRS (bool, optional): Whether to include the 'UPDRS_TOT' column in the output dataframe. Defaults to False.
Returns:
- pandas.DataFrame: A consolidated dataframe containing merged data from the specified files.
"""
# Collect data files
data_df = pd.read_csv(data)
cohort_df = pd.read_csv(cohort)
demographics_df = pd.read_csv(demographics)
# Create output dataframe
columnsToDrop = ["EVENT_ID", "Description", "Age", "Stage", "SEX", "UPDRS_TOT"] if useUPDRS else ["EVENT_ID", "Description", "Age", "Stage", "SEX"]
output_df = cohort_df \
.merge(demographics_df, on="PATNO") \
.drop(columnsToDrop, axis=1)
# Merge together
df = data_df \
.merge(output_df, on="PATNO") \
.drop(["Unnamed: 0",], axis=1)
return df
def getModel(model):
"""
Helper function that returns a machine learning model and its parameters for tuning based on the top parameters from Shu et al. models.
Parameters:
- model (str): The type of machine learning model to be returned. Options: "SVM", "DecisionTree", "kNN", "GNB".
Returns:
- tuple: A tuple containing the machine learning model (classifier) and a dictionary of hyperparameter grid for tuning.
"""
prefix = "train__"
if model == "SVM":
param_grid = {f'{prefix}C': [10, 100, 1000],
f'{prefix}gamma': [0.1, 0.01, 0.001],
f'{prefix}kernel': ['rbf'],
f'{prefix}class_weight': [None, 'balanced']}
clf = SVC(probability=True)
elif model == "DecisionTree":
param_grid = {f'{prefix}max_depth': [4, 5, 6, 8],
f'{prefix}max_leaf_nodes': list(range(5, 20, 1)),
f'{prefix}min_samples_split': [5, 8, 16],
f'{prefix}class_weight': [None, 'balanced']}
clf = DecisionTreeClassifier()
elif model == "kNN":
param_grid = {f'{prefix}n_neighbors': list(range(5, 20)),
f'{prefix}p': [1, 2],
f'{prefix}weights': ["uniform", "distance"]}
clf = KNeighborsClassifier()
elif model == "GNB":
param_grid = {f'{prefix}var_smoothing': np.logspace(0, -9, num=100)}
clf = GaussianNB()
param_grid[f'pca__n_components'] = [2, 3, 5, 7, 10]
return clf, param_grid
class MRMRFeatureSelector(BaseEstimator, TransformerMixin):
"""
Custom scikit-learn transformer for feature selection using MRMR (Maximum Relevance, Minimum Redundancy) algorithm.
Parameters:
- K (int, optional): The number of top features to select. Defaults to 7.
- n_jobs (int, optional): The number of CPU cores to use for parallel computation. Defaults to -1 (using all available cores).
"""
def __init__(self, K=7, n_jobs=-1):
self.K = K
self.n_jobs = n_jobs
def fit(self, X, y):
"""
Fit the MRMR feature selector to the input data.
Parameters:
- X (pandas.DataFrame): The feature matrix.
- y (pandas.Series): The target variable.
Returns:
- self (MRMRFeatureSelector): The fitted transformer object.
"""
self.selected_features_ = mrmr_classif(X, y, K=self.K, n_jobs=self.n_jobs)
return self
def transform(self, X):
"""
Transform the input data by selecting the relevant features.
Parameters:
- X (pandas.DataFrame): The feature matrix.
Returns:
- pandas.DataFrame: The transformed dataframe containing only the selected features.
"""
return X[self.selected_features_]
def nested_cross_validation(df, clf, param_grid, usePCA=True):
"""
Perform nested cross-validation for model evaluation.
Parameters:
- df (pandas.DataFrame): The input dataframe containing features and target variable.
- clf (BaseEstimator): The machine learning classifier to be evaluated.
- param_grid (dict): The hyperparameter grid for the grid search.
- usePCA (bool, optional): Flag indicating whether to include PCA in the pipeline. Defaults to True.
Returns:
- tuple: A tuple containing the fitted grid search object and an array of ROC AUC scores from outer cross-validation.
"""
# Get data
features = df.loc[:, ~df.columns.isin(['PATNO', 'output'])].columns
X = df[features]
y = df["output"]
# Define outer and inner cross-validation
cv_outer = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
cv_inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Define pipeline with or without PCA
if usePCA:
pipeline = Pipeline([
('scale', StandardScaler()),
('pca', PCA()),
('train', clf)
])
else:
del param_grid["pca__n_components"]
pipeline = Pipeline([
('scale', StandardScaler().set_output(transform="pandas")),
('mrmr', MRMRFeatureSelector(K=7)),
('train', clf)
])
# Train model using grid search
grid = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=cv_inner, n_jobs=1, verbose=0, refit=True)
scores = cross_val_score(grid, X, y, scoring='roc_auc', cv=cv_outer, n_jobs=-1, verbose=0)
return grid, scores
def run():
"""
Perform nested cross-validation for multiple cohorts and machine learning models, recording results in a dataframe.
Returns:
- pandas.DataFrame: A dataframe containing the mean and standard deviation of ROC AUC scores for each cohort,
model type, and feature selection method.
"""
results_df = pd.DataFrame(columns=["SVM", "Decision Tree", "kNN", "GNB"])
for cohort in cohorts:
# Skip missing files
if not check_data(cohort):
print(f"Skipping {cohort}")
continue
# Get data
df = get_df(data, f"{cohort}/cohort.csv", glob.glob(f"{cohort}/demographics*")[0])
cohortName = cohort.split("/")[-1]
# Define features
features = df.loc[:, ~df.columns.isin(['PATNO'])].columns
df_features = df[features]
print(f"COHORT: {cohortName}\n========================================")
for MODEL_TYPE in ["SVM", "DecisionTree", "kNN", "GNB"]:
print(f"\tTraining the following model: {MODEL_TYPE}")
# Define classifier
clf, param_grid = getModel(MODEL_TYPE)
# Train
_, scores_pca = nested_cross_validation(df_features, clf, param_grid, usePCA=True)
_, scores_mrmr = nested_cross_validation(df_features, clf, param_grid, usePCA=False)
if MODEL_TYPE == "SVM":
results_df.loc[f"PCA_{cohortName}", "SVM"] = f"{round(scores_pca.mean(), 3)} +/- {round(scores_pca.std(), 3)}"
if MODEL_TYPE == "DecisionTree":
results_df.loc[f"PCA_{cohortName}", "Decision Tree"] = f"{round(scores_pca.mean(), 3)} +/- {round(scores_pca.std(), 3)}"
if MODEL_TYPE == "kNN":
results_df.loc[f"PCA_{cohortName}", "kNN"] = f"{round(scores_pca.mean(), 3)} +/- {round(scores_pca.std(), 3)}"
if MODEL_TYPE == "GNB":
results_df.loc[f"PCA_{cohortName}", "GNB"] = f"{round(scores_pca.mean(), 3)} +/- {round(scores_pca.std(), 3)}"
if MODEL_TYPE == "SVM":
results_df.loc[f"MRMR_{cohortName}", "SVM"] = f"{round(scores_mrmr.mean(), 3)} +/- {round(scores_mrmr.std(), 3)}"
if MODEL_TYPE == "DecisionTree":
results_df.loc[f"MRMR_{cohortName}", "Decision Tree"] = f"{round(scores_mrmr.mean(), 3)} +/- {round(scores_mrmr.std(), 3)}"
if MODEL_TYPE == "kNN":
results_df.loc[f"MRMR_{cohortName}", "kNN"] = f"{round(scores_mrmr.mean(), 3)} +/- {round(scores_mrmr.std(), 3)}"
if MODEL_TYPE == "GNB":
results_df.loc[f"MRMR_{cohortName}", "GNB"] = f"{round(scores_mrmr.mean(), 3)} +/- {round(scores_mrmr.std(), 3)}"
return results_df
results_df = run()
results_df
COHORT: cohort_max_off_2_35_matched ======================================== Training the following model: SVM Training the following model: DecisionTree Training the following model: kNN Training the following model: GNB COHORT: cohort_max_off_off_65_75_unmatched ======================================== Training the following model: SVM Training the following model: DecisionTree Training the following model: kNN Training the following model: GNB COHORT: cohort_max_both_2_59_matched ======================================== Training the following model: SVM Training the following model: DecisionTree Training the following model: kNN Training the following model: GNB COHORT: cohort_max_both_102_92_unmatched ======================================== Training the following model: SVM Training the following model: DecisionTree Training the following model: kNN Training the following model: GNB COHORT: cohort_max_on_2_43_matched ======================================== Training the following model: SVM Training the following model: DecisionTree Training the following model: kNN Training the following model: GNB COHORT: cohort_max_on_on_66_81_unmatched ======================================== Training the following model: SVM Training the following model: DecisionTree Training the following model: kNN Training the following model: GNB COHORT: cohort_max_2_72_matched ======================================== Training the following model: SVM Training the following model: DecisionTree Training the following model: kNN Training the following model: GNB
SVM | Decision Tree | kNN | GNB | |
---|---|---|---|---|
PCA_cohort_max_off_2_35_matched | 0.617 +/- 0.244 | 0.597 +/- 0.198 | 0.529 +/- 0.184 | 0.566 +/- 0.181 |
PCA_cohort_max_off_off_65_75_unmatched | 0.541 +/- 0.137 | 0.6 +/- 0.142 | 0.526 +/- 0.186 | 0.527 +/- 0.098 |
PCA_cohort_max_both_2_59_matched | 0.586 +/- 0.189 | 0.515 +/- 0.089 | 0.502 +/- 0.152 | 0.415 +/- 0.109 |
PCA_cohort_max_both_102_92_unmatched | 0.458 +/- 0.112 | 0.553 +/- 0.091 | 0.504 +/- 0.06 | 0.553 +/- 0.134 |
PCA_cohort_max_on_2_43_matched | 0.506 +/- 0.173 | 0.495 +/- 0.146 | 0.545 +/- 0.14 | 0.486 +/- 0.192 |
PCA_cohort_max_on_on_66_81_unmatched | 0.531 +/- 0.089 | 0.519 +/- 0.07 | 0.518 +/- 0.066 | 0.48 +/- 0.134 |
PCA_cohort_max_2_72_matched | 0.405 +/- 0.099 | 0.461 +/- 0.105 | 0.467 +/- 0.105 | 0.509 +/- 0.082 |