#!/usr/bin/env python
# coding: utf-8

# # About
#
# This notebook demonstrates the stacking machine learning algorithm - folding,
# which physicists use in their analysis.

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')

# # Loading data
# ### download particle identification Data Set from UCI

# In[2]:

# FIX: the original command listed 'MiniBooNE_PID.txt' twice -- once as the -O
# output name and again as a bogus URL before the real one -- so wget first
# attempted (and failed) to download a non-URL. Only the real URL is kept.
get_ipython().system('cd toy_datasets; wget -O MiniBooNE_PID.txt -nc https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt')

# In[3]:

import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score

# FIX: sep='\s*' used a deprecated non-raw escape and can match an empty
# string, which pandas rejects as a separator; r'\s+' (one or more whitespace
# characters) is the correct whitespace-delimited form.
data = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=r'\s+', skiprows=[0], header=None, engine='python')

# The first line of the file holds two counts; build the label vector as
# that many 1s (signal) followed by that many 0s (background).
labels = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=' ', nrows=1, header=None)
labels = [1] * labels[1].values[0] + [0] * labels[2].values[0]

data.columns = ['feature_{}'.format(key) for key in data.columns]

train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.5)

# # Training variables

# In[4]:

variables = list(data.columns)

# # Folding strategy - stacking algorithm
#
# It implements the same interface as all classifiers, but with some difference:
#
# * all prediction methods have additional parameter "vote\_function"
#   (example folder.predict(X, __vote\_function=None)__), which is used to
#   combine all classifiers' predictions.
# By default "mean" is used as "vote_function"

# In[5]:

from rep.estimators import SklearnClassifier
from sklearn.ensemble import GradientBoostingClassifier

# ## Define folding model

# In[6]:

from rep.metaml import FoldingClassifier

# In[7]:

n_folds = 4
folder = FoldingClassifier(GradientBoostingClassifier(), n_folds=n_folds, features=variables)
folder.fit(train_data, train_labels)

# ## Default prediction (predict i_th_ fold by i_th_ classifier)

# In[8]:

folder.predict_proba(train_data)

# ## Voting prediction (predict i-fold by all classifiers and take value,
# which is calculated by `vote_function`)

# In[9]:

def mean_vote(x):
    """Combine the stacked per-classifier predictions by averaging along axis 0."""
    return numpy.mean(x, axis=0)

# In[10]:

folder.predict_proba(test_data, vote_function=mean_vote)

# # Comparison of folds
# Again use `ClassificationReport` class to compare different results.
# For folding classifier this report uses only __default prediction__.

# ## Report training dataset

# In[11]:

from rep.data.storage import LabeledDataStorage
from rep.report import ClassificationReport

# Attach the fold assignment as a column so it can drive the report masks.
# NOTE(review): _get_folds_column is a private FoldingClassifier method --
# confirm no public accessor exists before relying on it.
train_data["FOLDS"] = folder._get_folds_column(len(train_data))

lds = LabeledDataStorage(train_data, train_labels)
report = ClassificationReport({'folding': folder}, lds)

# ### Signal distribution for each fold
#
# Use `mask` parameter to plot distribution for the specific fold

# In[12]:

for fold in range(n_folds):
    report.prediction_pdf(mask="FOLDS == %d" % fold,
                          labels_dict={1: 'sig fold %d' % fold}).plot()

# ### Background distribution for each fold

# In[13]:

for fold in range(n_folds):
    report.prediction_pdf(mask="FOLDS == %d" % fold,
                          labels_dict={0: 'bck fold %d' % fold}).plot()

# ### ROCs (each fold used as test dataset)

# In[14]:

for fold in range(n_folds):
    report.roc(mask="FOLDS == %d" % fold).plot()

# ## Report for test dataset
#
# __NOTE__: Here vote function is None, so default prediction is used

# In[15]:

lds = LabeledDataStorage(test_data, test_labels)
report = ClassificationReport({'folding': folder}, lds)

# In[16]:

report.prediction_pdf().plot(new_plot=True, figsize=(9, 4))

# In[17]:

report.roc().plot(xlim=(0.5, 1))