#!/usr/bin/env python
# coding: utf-8

# #### Data source:
# Lee M, Teber ET, Holmes O, Nones K, Patch AM, Dagg RA, Lau LMS, Lee JH, Napier CE, Arthur JW, Grimmond SM, Hayward NK, Johansson PA, Mann GJ, Scolyer RA, Wilmott JS, Reddel RR, Pearson JV, Waddell N, Pickett HA.
# **Telomere sequence content can be used to determine ALT activity in tumours.**
# _Nucleic Acids Res._ 2018 Jun 1;46(10):4903-4918.

# # STEP 1: Preprocessing

# ## Load data

# In[2]:

import pandas as pd

# Load data
data = pd.read_csv("data/telomere_ALT/telomere.csv", sep='\t')
data.head(10)  # show the first ten samples

# ## Split data into training and test sets

# In[3]:

from sklearn import model_selection

num_row = data.shape[0]  # number of samples in the dataset
num_col = data.shape[1]  # number of columns (features plus the label column)

X = data.iloc[:, 0:num_col-1]  # feature columns
y = data['TMM']                # label column

X_train, X_test, y_train, y_test = \
    model_selection.train_test_split(X, y,
                                     test_size=0.2,  # reserve 20 percent of the data for testing
                                     stratify=y,     # stratified sampling
                                     random_state=0)

# In[4]:

X_train.shape

# In[5]:

X_test.shape

# In[6]:

y_train.value_counts()

# In[7]:

y_test.value_counts()

# # STEP 2: Learning

# ## Train a Random Forest classifier

# In[8]:

import numpy as np
from sklearn import ensemble

rf = ensemble.RandomForestClassifier(
    n_estimators=10,          # 10 random trees in the forest
    criterion='entropy',      # use entropy as the measure of uncertainty
    max_depth=3,              # maximum depth of each tree is 3
    min_samples_split=5,      # split a node only when it holds at least 5 samples
    class_weight='balanced',  # class weights inversely proportional to class frequencies
    random_state=0)

k = 3  # number of folds

# Split the training data into k folds
kfold = model_selection.StratifiedKFold(
    n_splits=k,
    shuffle=True,
    random_state=0)
cv = list(kfold.split(X_train, y_train))  # indices of samples in each fold

# In[9]:

for j, (train_idx, val_idx) in enumerate(cv):
    print('validation fold %d:' % (j+1))
    print(val_idx)

# ## Compute classifier accuracy

# In[10]:

accuracy = []

# Compute the classifier's accuracy on each fold
for j, (train_idx, val_idx) in enumerate(cv):
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    accuracy_per_fold = rf.score(X_train.iloc[val_idx], y_train.iloc[val_idx])
    accuracy.append(accuracy_per_fold)
    print('Fold %d accuracy: %.3f' % (j+1, accuracy_per_fold))

print('Average accuracy: %.3f' % np.mean(accuracy))
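# The per-fold loop above can be cross-checked with scikit-learn's built-in helper.
# The cell below is a minimal sketch (not part of the original analysis); it reuses the
# same `rf` and `kfold` defined earlier, so the fold scores should match the manual loop.

# In[ ]:

# cross_val_score clones the estimator, fits it on each training fold,
# and returns one accuracy score per validation fold
scores = model_selection.cross_val_score(rf, X_train, y_train, cv=kfold)
print('Per-fold accuracy:', np.round(scores, 3))
print('Average accuracy: %.3f' % scores.mean())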
# ## Plot receiver operating characteristic (ROC) curve

# In[11]:

from matplotlib import pyplot as plt
from sklearn import metrics

fpr = dict()    # false positive rate
tpr = dict()    # true positive rate
auroc = dict()  # area under the ROC curve (AUROC)

# Compute an ROC curve (with respect to the positive class) for each fold
# Compute the AUROC for each fold
for j, (train_idx, val_idx) in enumerate(cv):
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_prob = rf.predict_proba(X_train.iloc[val_idx])
    fpr[j], tpr[j], _ = metrics.roc_curve(y_train.iloc[val_idx], y_prob[:, 0], pos_label='+')
    auroc[j] = metrics.auc(fpr[j], tpr[j])

# Compute an average ROC curve over all folds
fpr['avg'] = np.unique(np.concatenate([fpr[j] for j in range(k)]))
tpr['avg'] = np.zeros_like(fpr['avg'])
for j in range(k):
    tpr['avg'] += np.interp(fpr['avg'], fpr[j], tpr[j])
tpr['avg'] /= k

# Compute the AUROC of the average ROC curve
auroc['avg'] = metrics.auc(fpr['avg'], tpr['avg'])

# In[12]:

# Plot the ROC curves
for j in range(k):
    plt.plot(fpr[j], tpr[j], label='fold %d (area = %.3f)' % (j+1, auroc[j]))
plt.plot(fpr['avg'], tpr['avg'], 'k--', label='mean ROC (area = %.3f)' % auroc['avg'], lw=2)
plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6))        # ROC curve of a random classifier
plt.plot([0, 0, 1], [0, 1, 1], linestyle='--', color=(0.6, 0.6, 0.6))  # ROC curve of a perfect classifier
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc='lower right')
plt.show()

# ## Plot precision-recall (PR) curve

# In[13]:

precision_plus = dict()
recall_plus = dict()
auprc_plus = dict()
precision_minus = dict()
recall_minus = dict()
auprc_minus = dict()

# Compute PR curves (with respect to the positive and negative classes) for each fold
# Compute the AUPRC for each fold
for j, (train_idx, val_idx) in enumerate(cv):
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_prob = rf.predict_proba(X_train.iloc[val_idx])
    precision_plus[j], recall_plus[j], _ = metrics.precision_recall_curve(y_train.iloc[val_idx],
                                                                          y_prob[:, 0], pos_label='+')
    precision_minus[j], recall_minus[j], _ = metrics.precision_recall_curve(y_train.iloc[val_idx],
                                                                            y_prob[:, 1], pos_label='-')
    auprc_plus[j] = metrics.auc(recall_plus[j], precision_plus[j])
    auprc_minus[j] = metrics.auc(recall_minus[j], precision_minus[j])

# Compute average PR curves over all folds
recall_plus['avg'] = np.unique(np.concatenate([recall_plus[j] for j in range(k)]))
recall_minus['avg'] = np.unique(np.concatenate([recall_minus[j] for j in range(k)]))
precision_plus['avg'] = np.zeros_like(recall_plus['avg'])
precision_minus['avg'] = np.zeros_like(recall_minus['avg'])
for j in range(k):
    # np.interp expects increasing x values, so flip the recall/precision arrays first
    precision_plus['avg'] += np.interp(recall_plus['avg'],
                                       np.flip(recall_plus[j], 0), np.flip(precision_plus[j], 0))
    precision_minus['avg'] += np.interp(recall_minus['avg'],
                                        np.flip(recall_minus[j], 0), np.flip(precision_minus[j], 0))
precision_plus['avg'] /= k
precision_minus['avg'] /= k

# Compute the AUPRCs of the average curves
auprc_plus['avg'] = metrics.auc(recall_plus['avg'], precision_plus['avg'])
auprc_minus['avg'] = metrics.auc(recall_minus['avg'], precision_minus['avg'])

# ### PR curve w.r.t. the positive class

# In[14]:

# Plot the PR curves for the positive class
for j in range(k):
    plt.plot(recall_plus[j], precision_plus[j], label='fold %d (area = %.3f)' % (j+1, auprc_plus[j]))
plt.plot(recall_plus['avg'], precision_plus['avg'], 'k--',
         label='mean PR (area = %.3f)' % auprc_plus['avg'], lw=2)
plt.plot([0, 1, 1], [1, 1, 0], linestyle='--', color=(0.6, 0.6, 0.6))  # PR curve of a perfect classifier
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend(loc='lower left')
plt.show()

# ### PR curve w.r.t. the negative class

# In[15]:

# Plot the PR curves for the negative class
for j in range(k):
    plt.plot(recall_minus[j], precision_minus[j], label='fold %d (area = %.3f)' % (j+1, auprc_minus[j]))
plt.plot(recall_minus['avg'], precision_minus['avg'], 'k--',
         label='mean PR (area = %.3f)' % auprc_minus['avg'], lw=2)
plt.plot([0, 1, 1], [1, 1, 0], linestyle='--', color=(0.6, 0.6, 0.6))  # PR curve of a perfect classifier
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend(loc='lower left')
plt.show()
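# Trapezoidal integration of a PR curve (what `metrics.auc` does above) can be slightly
# optimistic because precision does not change monotonically with recall. The cell below
# is a minimal sketch (not part of the original analysis) that cross-checks the per-fold
# AUPRC for the positive class with average precision, which summarizes the same curve
# without interpolation; it reuses the fold indices `cv` and the classifier `rf` from above.

# In[ ]:

ap_plus = []
for j, (train_idx, val_idx) in enumerate(cv):
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_prob = rf.predict_proba(X_train.iloc[val_idx])
    # average_precision_score takes the probability of the class named by pos_label
    ap = metrics.average_precision_score(y_train.iloc[val_idx], y_prob[:, 0], pos_label='+')
    ap_plus.append(ap)
    print('Fold %d average precision (+): %.3f' % (j+1, ap))
print('Mean average precision (+): %.3f' % np.mean(ap_plus))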
# ## Plot confusion matrix

# In[16]:

# Compute a 2x2 confusion matrix on each fold
confusion_matrix = np.zeros([2, 2])
for j, (train_idx, val_idx) in enumerate(cv):
    rf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
    y_pred = rf.predict(X_train.iloc[val_idx])
    confusion_matrix += metrics.confusion_matrix(y_train.iloc[val_idx], y_pred)

# Average over all folds
confusion_matrix /= k
confusion_matrix = np.around(confusion_matrix, decimals=3)

# In[17]:

# Plot the confusion matrix
plt.matshow(confusion_matrix, interpolation='nearest', cmap=plt.cm.Blues, alpha=0.5)
plt.xticks(np.arange(2), ('+', '-'))
plt.yticks(np.arange(2), ('+', '-'))
for i in range(2):
    for j in range(2):
        plt.text(j, i, confusion_matrix[i, j], ha='center', va='center')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# # STEP 3: Evaluation

# In[18]:

# Refit the classifier on the full training set
rf = rf.fit(X_train, y_train)

# ### Compute classifier accuracy on the test set

# In[19]:

# Compute the classifier's accuracy on the test set
test_accuracy = rf.score(X_test, y_test)
print('Test accuracy: %.3f' % test_accuracy)

# ### Generate plots for the test set

# In[20]:

# Predict labels for the test set
y_pred = rf.predict(X_test)        # hard prediction (class labels)
y_prob = rf.predict_proba(X_test)  # soft prediction (class probabilities)

# Compute the ROC and PR curves for the test set
fpr, tpr, _ = metrics.roc_curve(y_test, y_prob[:, 0], pos_label='+')
precision_plus, recall_plus, _ = metrics.precision_recall_curve(y_test, y_prob[:, 0], pos_label='+')
precision_minus, recall_minus, _ = metrics.precision_recall_curve(y_test, y_prob[:, 1], pos_label='-')

# Compute the AUROC and AUPRCs for the test set
auroc = metrics.auc(fpr, tpr)
auprc_plus = metrics.auc(recall_plus, precision_plus)
auprc_minus = metrics.auc(recall_minus, precision_minus)

# Compute the confusion matrix for the test set
cm = metrics.confusion_matrix(y_test, y_pred)

# In[21]:

# Plot the ROC curve for the test set
plt.plot(fpr, tpr, label='test (area = %.3f)' % auroc)
plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6))
plt.plot([0, 0, 1], [0, 1, 1], linestyle='--', color=(0.6, 0.6, 0.6))
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc='lower right')
plt.title('ROC curve')
plt.show()

# In[22]:

# Plot the PR curves for the test set
plt.plot(recall_plus, precision_plus, label='positive class (area = %.3f)' % auprc_plus)
plt.plot(recall_minus, precision_minus, label='negative class (area = %.3f)' % auprc_minus)
plt.plot([0, 1, 1], [1, 1, 0], linestyle='--', color=(0.6, 0.6, 0.6))
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend(loc='lower left')
plt.title('Precision-recall curve')
plt.show()

# In[23]:

# Plot the confusion matrix for the test set
plt.matshow(cm, interpolation='nearest', cmap=plt.cm.Blues, alpha=0.5)
plt.xticks(np.arange(2), ('+', '-'))
plt.yticks(np.arange(2), ('+', '-'))
for i in range(2):
    for j in range(2):
        plt.text(j, i, cm[i, j], ha='center', va='center')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# # STEP 4: Prediction

# In[31]:

# Load unlabeled data for prediction
newdata = pd.read_csv("data/telomere_ALT/telomere_new.csv", sep='\t')
newdata.head(6)

# In[32]:

# Predict labels for the new samples
y_new = rf.predict(newdata)
y_new
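# Hard labels alone hide how confident the forest is about each new sample. The cell below
# is a minimal sketch (not part of the original analysis) that pairs each prediction with its
# class probabilities; the output column names are illustrative, not from the original dataset.

# In[ ]:

# Class probabilities for the new samples; columns follow rf.classes_, i.e. ('+', '-')
y_new_prob = rf.predict_proba(newdata)

predictions = pd.DataFrame({
    'TMM_pred': y_new,               # predicted label (illustrative column name)
    'prob_plus': y_new_prob[:, 0],   # estimated probability of the '+' class
    'prob_minus': y_new_prob[:, 1],  # estimated probability of the '-' class
}, index=newdata.index)
predictions.head(6)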