#!/usr/bin/env python
# coding: utf-8

# # About
#
# This notebook demonstrates the stacking machine learning algorithm - folding,
# which physicists use in their analysis.

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')

# # Loading data
# ### download particle identification Data Set from UCI

# In[2]:

# FIX: the original command listed 'MiniBooNE_PID.txt' twice -- once as the -O
# output name and again as a bogus URL before the real one -- so wget first
# attempted (and failed) to download a non-URL. Only the real URL is kept.
get_ipython().system('cd toy_datasets; wget -O MiniBooNE_PID.txt -nc https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt')

# In[3]:

import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score

# FIX: sep='\s*' used a deprecated non-raw escape and can match an empty
# string, which pandas rejects as a separator; r'\s+' (one or more whitespace
# characters) is the correct whitespace-delimited form.
data = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=r'\s+', skiprows=[0], header=None, engine='python')

# The first line of the file holds two counts; build the label vector as
# that many 1s (signal) followed by that many 0s (background).
labels = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=' ', nrows=1, header=None)
labels = [1] * labels[1].values[0] + [0] * labels[2].values[0]

data.columns = ['feature_{}'.format(key) for key in data.columns]

train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.5)

# # Training variables

# In[4]:

variables = list(data.columns)

# # Folding strategy - stacking algorithm
#
# It implements the same interface as all classifiers, but with some difference:
#
# * all prediction methods have additional parameter "vote\_function"
#   (example folder.predict(X, __vote\_function=None)__), which is used to
#   combine all classifiers' predictions.
# By default "mean" is used as "vote_function"

# In[5]:

from rep.estimators import SklearnClassifier
from sklearn.ensemble import GradientBoostingClassifier

# ## Define folding model

# In[6]:

from rep.metaml import FoldingClassifier

# In[7]:

n_folds = 4
folder = FoldingClassifier(GradientBoostingClassifier(), n_folds=n_folds, features=variables)
folder.fit(train_data, train_labels)

# ## Default prediction (predict i_th_ fold by i_th_ classifier)

# In[8]:

folder.predict_proba(train_data)

# ## Voting prediction (predict i-fold by all classifiers and take value,
# which is calculated by `vote_function`)

# In[9]:

def mean_vote(x):
    """Combine the stacked per-classifier predictions by averaging along axis 0."""
    return numpy.mean(x, axis=0)

# In[10]:

folder.predict_proba(test_data, vote_function=mean_vote)

# # Comparison of folds
# Again use `ClassificationReport` class to compare different results.
# For folding classifier this report uses only __default prediction__.

# ## Report training dataset

# In[11]:

from rep.data.storage import LabeledDataStorage
from rep.report import ClassificationReport

# Attach the fold assignment as a column so it can drive the report masks.
# NOTE(review): _get_folds_column is a private FoldingClassifier method --
# confirm no public accessor exists before relying on it.
train_data["FOLDS"] = folder._get_folds_column(len(train_data))

lds = LabeledDataStorage(train_data, train_labels)
report = ClassificationReport({'folding': folder}, lds)

# ### Signal distribution for each fold
#
# Use `mask` parameter to plot distribution for the specific fold

# In[12]:

for fold in range(n_folds):
    report.prediction_pdf(mask="FOLDS == %d" % fold,
                          labels_dict={1: 'sig fold %d' % fold}).plot()

# ### Background distribution for each fold

# In[13]:

for fold in range(n_folds):
    report.prediction_pdf(mask="FOLDS == %d" % fold,
                          labels_dict={0: 'bck fold %d' % fold}).plot()

# ### ROCs (each fold used as test dataset)

# In[14]:

for fold in range(n_folds):
    report.roc(mask="FOLDS == %d" % fold).plot()

# ## Report for test dataset
#
# __NOTE__: Here vote function is None, so default prediction is used

# In[15]:

lds = LabeledDataStorage(test_data, test_labels)
report = ClassificationReport({'folding': folder}, lds)

# In[16]:

report.prediction_pdf().plot(new_plot=True, figsize=(9, 4))

# In[17]:

report.roc().plot(xlim=(0.5, 1))