#!/usr/bin/env python
# coding: utf-8

# #### Data source:
# Lee M, Teber ET, Holmes O, Nones K, Patch AM, Dagg RA, Lau LMS, Lee JH, Napier CE, Arthur JW, Grimmond SM, Hayward NK, Johansson PA, Mann GJ, Scolyer RA, Wilmott JS, Reddel RR, Pearson JV, Waddell N, Pickett HA.
# **Telomere sequence content can be used to determine ALT activity in tumours.**
# _Nucleic Acids Res._ 2018 Jun 1;46(10):4903-4918.

# # STEP 1: Preprocessing

# ## Load data

# In[2]:

import pandas as pd

# Load data
data = pd.read_csv("data/telomere_ALT/telomere.csv", sep='\t')
data.head(10)  # show the first ten samples

# ## Split data into training and test sets

# In[3]:

from sklearn import model_selection

num_row = data.shape[0]  # number of samples in the dataset
num_col = data.shape[1]  # number of columns (features plus the label column)

X = data.iloc[:, 0:num_col-1]  # feature columns
y = data['TMM']                # label column

X_train, X_test, y_train, y_test = \
    model_selection.train_test_split(X, y,
                                     test_size=0.2,  # reserve 20 percent of the data for testing
                                     stratify=y,     # stratified sampling
                                     random_state=0)

# In[4]:

X_train.shape

# In[5]:

X_test.shape

# In[6]:

y_train.value_counts()

# In[7]:

y_test.value_counts()

# # STEP 2: Learning

# ## Train a Random Forest classifier

# In[8]:

import numpy as np
from sklearn import ensemble

rf = ensemble.RandomForestClassifier(
    n_estimators=10,          # 10 random trees in the forest
    criterion='entropy',      # use entropy as the measure of uncertainty
    max_depth=3,              # maximum depth of each tree is 3
    min_samples_split=5,      # split a node only when it holds at least 5 samples
    class_weight='balanced',  # class weights inversely proportional to class frequencies
    random_state=0)

k = 3  # number of folds

# Split the training data into k folds
kfold = model_selection.StratifiedKFold(
    n_splits=k,
    shuffle=True,
    random_state=0)
cv = list(kfold.split(X_train, y_train))  # indices of samples in each fold

# In[9]:

for j, (train_idx, val_idx) in enumerate(cv):
    print('validation fold %d:' % (j+1))
    print(val_idx)

# ## Compute classifier accuracy

# In[10]:

accuracy = []

# Compute the classifier's accuracy on each fold
for j, (train_idx, val_idx) in enumerate(cv):
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    accuracy_per_fold = rf.score(X_train.iloc[val_idx], y_train.iloc[val_idx])
    accuracy.append(accuracy_per_fold)
    print('Fold %d accuracy: %.3f' % (j+1, accuracy_per_fold))

print('Average accuracy: %.3f' % np.mean(accuracy))
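# The per-fold loop above can be cross-checked with scikit-learn's built-in helper.
# The cell below is a minimal sketch (not part of the original analysis); it reuses the
# same `rf` and `kfold` defined earlier, so the fold scores should match the manual loop.

# In[ ]:

# cross_val_score clones the estimator, fits it on each training fold,
# and returns one accuracy score per validation fold
scores = model_selection.cross_val_score(rf, X_train, y_train, cv=kfold)
print('Per-fold accuracy:', np.round(scores, 3))
print('Average accuracy: %.3f' % scores.mean())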
# ## Plot receiver operating characteristic (ROC) curve

# In[11]:

from matplotlib import pyplot as plt
from sklearn import metrics

fpr = dict()    # false positive rate
tpr = dict()    # true positive rate
auroc = dict()  # area under the ROC curve (AUROC)

# Compute an ROC curve (with respect to the positive class) for each fold
# Compute the AUROC for each fold
for j, (train_idx, val_idx) in enumerate(cv):
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_prob = rf.predict_proba(X_train.iloc[val_idx])
    fpr[j], tpr[j], _ = metrics.roc_curve(y_train.iloc[val_idx], y_prob[:, 0], pos_label='+')
    auroc[j] = metrics.auc(fpr[j], tpr[j])

# Compute an average ROC curve over all folds
fpr['avg'] = np.unique(np.concatenate([fpr[j] for j in range(k)]))
tpr['avg'] = np.zeros_like(fpr['avg'])
for j in range(k):
    tpr['avg'] += np.interp(fpr['avg'], fpr[j], tpr[j])
tpr['avg'] /= k

# Compute the AUROC of the average ROC curve
auroc['avg'] = metrics.auc(fpr['avg'], tpr['avg'])

# In[12]:

# Plot the ROC curves
for j in range(k):
    plt.plot(fpr[j], tpr[j], label='fold %d (area = %.3f)' % (j+1, auroc[j]))
plt.plot(fpr['avg'], tpr['avg'], 'k--', label='mean ROC (area = %.3f)' % auroc['avg'], lw=2)
plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6))        # ROC curve of a random classifier
plt.plot([0, 0, 1], [0, 1, 1], linestyle='--', color=(0.6, 0.6, 0.6))  # ROC curve of a perfect classifier
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc='lower right')
plt.show()

# ## Plot precision-recall (PR) curve

# In[13]:

precision_plus = dict()
recall_plus = dict()
auprc_plus = dict()
precision_minus = dict()
recall_minus = dict()
auprc_minus = dict()

# Compute PR curves (with respect to the positive and negative classes) for each fold
# Compute the AUPRC for each fold
for j, (train_idx, val_idx) in enumerate(cv):
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_prob = rf.predict_proba(X_train.iloc[val_idx])
    precision_plus[j], recall_plus[j], _ = metrics.precision_recall_curve(y_train.iloc[val_idx],
                                                                          y_prob[:, 0], pos_label='+')
    precision_minus[j], recall_minus[j], _ = metrics.precision_recall_curve(y_train.iloc[val_idx],
                                                                            y_prob[:, 1], pos_label='-')
    auprc_plus[j] = metrics.auc(recall_plus[j], precision_plus[j])
    auprc_minus[j] = metrics.auc(recall_minus[j], precision_minus[j])

# Compute average PR curves over all folds
recall_plus['avg'] = np.unique(np.concatenate([recall_plus[j] for j in range(k)]))
recall_minus['avg'] = np.unique(np.concatenate([recall_minus[j] for j in range(k)]))
precision_plus['avg'] = np.zeros_like(recall_plus['avg'])
precision_minus['avg'] = np.zeros_like(recall_minus['avg'])
for j in range(k):
    # np.interp expects increasing x values, so flip the recall/precision arrays first
    precision_plus['avg'] += np.interp(recall_plus['avg'],
                                       np.flip(recall_plus[j], 0), np.flip(precision_plus[j], 0))
    precision_minus['avg'] += np.interp(recall_minus['avg'],
                                        np.flip(recall_minus[j], 0), np.flip(precision_minus[j], 0))
precision_plus['avg'] /= k
precision_minus['avg'] /= k

# Compute the AUPRCs of the average curves
auprc_plus['avg'] = metrics.auc(recall_plus['avg'], precision_plus['avg'])
auprc_minus['avg'] = metrics.auc(recall_minus['avg'], precision_minus['avg'])

# ### PR curve w.r.t. the positive class

# In[14]:

# Plot the PR curves for the positive class
for j in range(k):
    plt.plot(recall_plus[j], precision_plus[j], label='fold %d (area = %.3f)' % (j+1, auprc_plus[j]))
plt.plot(recall_plus['avg'], precision_plus['avg'], 'k--',
         label='mean PR (area = %.3f)' % auprc_plus['avg'], lw=2)
plt.plot([0, 1, 1], [1, 1, 0], linestyle='--', color=(0.6, 0.6, 0.6))  # PR curve of a perfect classifier
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend(loc='lower left')
plt.show()

# ### PR curve w.r.t. the negative class

# In[15]:

# Plot the PR curves for the negative class
for j in range(k):
    plt.plot(recall_minus[j], precision_minus[j], label='fold %d (area = %.3f)' % (j+1, auprc_minus[j]))
plt.plot(recall_minus['avg'], precision_minus['avg'], 'k--',
         label='mean PR (area = %.3f)' % auprc_minus['avg'], lw=2)
plt.plot([0, 1, 1], [1, 1, 0], linestyle='--', color=(0.6, 0.6, 0.6))  # PR curve of a perfect classifier
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend(loc='lower left')
plt.show()
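# Trapezoidal integration of a PR curve (what `metrics.auc` does above) can be slightly
# optimistic because precision does not change monotonically with recall. The cell below
# is a minimal sketch (not part of the original analysis) that cross-checks the per-fold
# AUPRC for the positive class with average precision, which summarizes the same curve
# without interpolation; it reuses the fold indices `cv` and the classifier `rf` from above.

# In[ ]:

ap_plus = []
for j, (train_idx, val_idx) in enumerate(cv):
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_prob = rf.predict_proba(X_train.iloc[val_idx])
    # average_precision_score takes the probability of the class named by pos_label
    ap = metrics.average_precision_score(y_train.iloc[val_idx], y_prob[:, 0], pos_label='+')
    ap_plus.append(ap)
    print('Fold %d average precision (+): %.3f' % (j+1, ap))
print('Mean average precision (+): %.3f' % np.mean(ap_plus))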
# ## Plot confusion matrix

# In[16]:

# Compute a 2x2 confusion matrix on each fold
confusion_matrix = np.zeros([2, 2])
for j, (train_idx, val_idx) in enumerate(cv):
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_pred = rf.predict(X_train.iloc[val_idx])
    confusion_matrix += metrics.confusion_matrix(y_train.iloc[val_idx], y_pred)

# Average over all folds
confusion_matrix /= k
confusion_matrix = np.around(confusion_matrix, decimals=3)

# In[17]:

# Plot the confusion matrix
plt.matshow(confusion_matrix, interpolation='nearest', cmap=plt.cm.Blues, alpha=0.5)
plt.xticks(np.arange(2), ('+', '-'))
plt.yticks(np.arange(2), ('+', '-'))
for i in range(2):
    for j in range(2):
        plt.text(j, i, confusion_matrix[i, j], ha='center', va='center')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# # STEP 3: Evaluation

# In[18]:

# Refit the classifier on the full training set
rf = rf.fit(X_train, y_train)

# ### Compute classifier accuracy on the test set

# In[19]:

# Compute the classifier's accuracy on the test set
test_accuracy = rf.score(X_test, y_test)
print('Test accuracy: %.3f' % test_accuracy)

# ### Generate plots for the test set

# In[20]:

# Predict labels for the test set
y_pred = rf.predict(X_test)        # hard prediction (class labels)
y_prob = rf.predict_proba(X_test)  # soft prediction (class probabilities)

# Compute the ROC and PR curves for the test set
fpr, tpr, _ = metrics.roc_curve(y_test, y_prob[:, 0], pos_label='+')
precision_plus, recall_plus, _ = metrics.precision_recall_curve(y_test, y_prob[:, 0], pos_label='+')
precision_minus, recall_minus, _ = metrics.precision_recall_curve(y_test, y_prob[:, 1], pos_label='-')

# Compute the AUROC and AUPRCs for the test set
auroc = metrics.auc(fpr, tpr)
auprc_plus = metrics.auc(recall_plus, precision_plus)
auprc_minus = metrics.auc(recall_minus, precision_minus)

# Compute the confusion matrix for the test set
cm = metrics.confusion_matrix(y_test, y_pred)

# In[21]:

# Plot the ROC curve for the test set
plt.plot(fpr, tpr, label='test (area = %.3f)' % auroc)
plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6))
plt.plot([0, 0, 1], [0, 1, 1], linestyle='--', color=(0.6, 0.6, 0.6))
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc='lower right')
plt.title('ROC curve')
plt.show()

# In[22]:

# Plot the PR curves for the test set
plt.plot(recall_plus, precision_plus, label='positive class (area = %.3f)' % auprc_plus)
plt.plot(recall_minus, precision_minus, label='negative class (area = %.3f)' % auprc_minus)
plt.plot([0, 1, 1], [1, 1, 0], linestyle='--', color=(0.6, 0.6, 0.6))
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend(loc='lower left')
plt.title('Precision-recall curve')
plt.show()

# In[23]:

# Plot the confusion matrix for the test set
plt.matshow(cm, interpolation='nearest', cmap=plt.cm.Blues, alpha=0.5)
plt.xticks(np.arange(2), ('+', '-'))
plt.yticks(np.arange(2), ('+', '-'))
for i in range(2):
    for j in range(2):
        plt.text(j, i, cm[i, j], ha='center', va='center')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# # STEP 4: Prediction

# In[31]:

# Load unlabeled data for prediction
newdata = pd.read_csv("data/telomere_ALT/telomere_new.csv", sep='\t')
newdata.head(6)

# In[32]:

# Predict labels for the new samples
y_new = rf.predict(newdata)
y_new
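# Hard labels alone hide how confident the forest is about each new sample. The cell below
# is a minimal sketch (not part of the original analysis) that pairs each prediction with its
# class probabilities; the output column names are illustrative, not from the original dataset.

# In[ ]:

# Class probabilities for the new samples; columns follow rf.classes_, i.e. ('+', '-')
y_new_prob = rf.predict_proba(newdata)

predictions = pd.DataFrame({
    'TMM_pred': y_new,               # predicted label (illustrative column name)
    'prob_plus': y_new_prob[:, 0],   # estimated probability of the '+' class
    'prob_minus': y_new_prob[:, 1],  # estimated probability of the '-' class
}, index=newdata.index)
predictions.head(6)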