#!/usr/bin/env python
# coding: utf-8

# # Tf-idf Vectorizer, Logistic Regression

# In[1]:


from setup_corpus import build_corpora

# Build the corpora: one table per category, with an `excerpt` column and a
# binary label column named after the category.
corpora = build_corpora()
print(corpora)


# In[2]:


from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (confusion_matrix, classification_report, accuracy_score,
                             roc_curve, auc, precision_recall_curve, average_precision_score)

# Tf-idf features feeding a logistic regression classifier, evaluated with
# stratified 5-fold cross-validation.
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear'))
cv = StratifiedKFold(n_splits=5, shuffle=True)


# In[3]:


for category in corpora:
    scores = cross_val_score(pipeline, corpora[category].excerpt, corpora[category][category], cv=cv)
    print(f"Category: {category}\nScores: {scores}\nAccuracy: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")
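# Accuracy alone can hide class imbalance in these corpora. A minimal sketch using
# the already-imported cross_validate with extra scorers (the scoring strings are
# standard sklearn scorer names; this assumes the labels are binary 0/1):

for category in corpora:
    results = cross_validate(pipeline, corpora[category].excerpt, corpora[category][category],
                             cv=cv, scoring=['accuracy', 'precision', 'recall', 'f1'])
    print(f"Category: {category}")
    for metric in ['accuracy', 'precision', 'recall', 'f1']:
        fold_scores = results[f'test_{metric}']
        print(f"  {metric}: {fold_scores.mean():.4f} (+/- {fold_scores.std()*2:.4f})")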
# In[4]:


import numpy as np
import matplotlib.pyplot as plt


# In[5]:


X = corpora['description'].excerpt
y = corpora['description'].description

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

print('Description ROC')
for i, (train, test) in enumerate(cv.split(X, y)):
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas_[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Description Classification')
plt.legend(loc="lower right")
plt.show()


# In[6]:


for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    precision, recall, _ = precision_recall_curve(y.iloc[test], probas_[:, 1])
    plt.step(recall, precision, alpha=0.2, where='post',
             label=f'average precision={average_precision_score(y.iloc[test], probas_[:, 1]):.2f}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Description Precision-Recall curve')
plt.legend(loc="lower right")
plt.show()
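# The ROC and precision-recall cells below repeat this exact pattern for the
# installation, invocation, and citation categories. A minimal sketch of a helper
# that could factor out the duplication (plot_cv_roc is a hypothetical name, not
# part of the original notebook; the std-dev band is omitted for brevity):

def plot_cv_roc(pipeline, cv, X, y, title):
    """Plot per-fold and mean ROC curves for a binary text classifier."""
    tprs, aucs = [], []
    mean_fpr = np.linspace(0, 1, 100)
    for i, (train, test) in enumerate(cv.split(X, y)):
        probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
        fpr, tpr, _ = roc_curve(y.iloc[test], probas_[:, 1])
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        aucs.append(auc(fpr, tpr))
        plt.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (i, aucs[-1]))
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    plt.plot(mean_fpr, mean_tpr, color='b', lw=2, alpha=.8,
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (auc(mean_fpr, mean_tpr), np.std(aucs)))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()

# Example usage:
# plot_cv_roc(pipeline, cv, corpora['installation'].excerpt,
#             corpora['installation'].installation,
#             'ROC Curve for Installation Classification')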
# In[7]:


X = corpora['installation'].excerpt
y = corpora['installation'].installation

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

print('Installation ROC')
for i, (train, test) in enumerate(cv.split(X, y)):
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas_[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Installation Classification')
plt.legend(loc="lower right")
plt.show()


# In[8]:


for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    precision, recall, _ = precision_recall_curve(y.iloc[test], probas_[:, 1])
    plt.step(recall, precision, alpha=0.2, where='post',
             label=f'average precision={average_precision_score(y.iloc[test], probas_[:, 1]):.2f}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Installation Precision-Recall curve')
plt.legend(loc="lower right")
plt.show()
# In[9]:


X = corpora['invocation'].excerpt
y = corpora['invocation'].invocation

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

print('Invocation ROC')
for i, (train, test) in enumerate(cv.split(X, y)):
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas_[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Invocation Classification')
plt.legend(loc="lower right")
plt.show()


# In[10]:


for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    precision, recall, _ = precision_recall_curve(y.iloc[test], probas_[:, 1])
    plt.step(recall, precision, alpha=0.2, where='post',
             label=f'average precision={average_precision_score(y.iloc[test], probas_[:, 1]):.2f}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Invocation Precision-Recall curve')
plt.legend(loc="lower right")
plt.show()


# In[11]:


X = corpora['citation'].excerpt
y = corpora['citation'].citation

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

print('Citation ROC')
for i, (train, test) in enumerate(cv.split(X, y)):
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas_[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Citation Classification')
plt.legend(loc="lower right")
plt.show()


# In[12]:


for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    precision, recall, _ = precision_recall_curve(y.iloc[test], probas_[:, 1])
    plt.step(recall, precision, alpha=0.2, where='post',
             label=f'average precision={average_precision_score(y.iloc[test], probas_[:, 1]):.2f}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Citation Precision-Recall curve')
plt.legend(loc="lower right")
plt.show()
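# confusion_matrix, classification_report, and accuracy_score are imported above but
# never used. A minimal sketch of how they could summarize a single cross-validation
# fold for the citation classifier (reuses X and y from the citation cells above):

train, test = next(cv.split(X, y))
preds = pipeline.fit(X.iloc[train], y.iloc[train]).predict(X.iloc[test])
print('Accuracy:', accuracy_score(y.iloc[test], preds))
print('Confusion matrix:')
print(confusion_matrix(y.iloc[test], preds))
print(classification_report(y.iloc[test], preds))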