#!/usr/bin/env python
# coding: utf-8
# # About
#
# This notebook demonstrates classifiers provided by the __Reproducible experiment platform (REP)__ package.
# REP contains the following classifiers:
# * __scikit-learn__
# * __TMVA__
# * __XGBoost__
# * estimators from __hep_ml__
# * __theanets__
# * __PyBrain__
# * __Neurolab__
#
# (and any `sklearn`-compatible classifiers may be used).
#
# Neural network libraries are introduced in a different notebook.
#
# ### In this notebook we show the simplest way to
# * train a classifier
# * build predictions
# * measure quality
# * combine metaclassifiers
#
#
# # Loading data
# ### Download the MiniBooNE particle identification Data Set from UCI
# In[1]:
get_ipython().system('cd toy_datasets; wget -O MiniBooNE_PID.txt -nc --no-check-certificate https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt')
# In[2]:
import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score
# the first row of the file contains the numbers of signal and background events; the remaining rows are the events themselves
data = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=r'\s*', skiprows=[0], header=None, engine='python')
labels = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=' ', nrows=1, header=None)
# signal events come first in the file (label 1), followed by background events (label 0)
labels = [1] * labels[1].values[0] + [0] * labels[2].values[0]
data.columns = ['feature_{}'.format(key) for key in data.columns]
# ### First rows of our data
# In[3]:
data[:5]
# ### Splitting into train and test
# In[4]:
# Get train and test data
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.25)
# # Classifiers
#
# All classifiers inherit from __sklearn.base.BaseEstimator__ and have the following methods:
#
# * `classifier.fit(X, y, sample_weight=None)` - train classifier
#
# * `classifier.predict_proba(X)` - return probabilities vector for all classes
#
# * `classifier.predict(X)` - return predicted labels
#
# * `classifier.staged_predict_proba(X)` - return probabilities after each iteration (not supported by TMVA)
#
# * `classifier.get_feature_importances()` - return the importances of the features used in training
#
#
# Here we use `X` to denote a matrix with data of shape `[n_samples, n_features]`, `y` is a vector of labels (0 or 1) of shape `[n_samples]`,
# and `sample_weight` is a vector of sample weights.
#
#
# ## Difference from default scikit-learn interface
# X should be* a `pandas.DataFrame`, not a `numpy.array`.
# Provided this, you can select the features used in training by setting e.g. `features=['FlightTime', 'p']` in the constructor.
#
# \* it works fine with `numpy.array` as well, but in this case all the features will be used.
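#
# As a minimal sketch of this behaviour (the two column names below are simply the first columns of this particular dataset, chosen for illustration):
# In[ ]:
from rep.estimators import SklearnClassifier
from sklearn.ensemble import GradientBoostingClassifier
# only the two listed columns are used in training, even though `data` has 50 features
sk_two_features = SklearnClassifier(GradientBoostingClassifier(n_estimators=10), features=['feature_0', 'feature_1'])
sk_two_features.fit(train_data, train_labels)
sk_two_features.get_feature_importances()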
# # Variables used in training
# In[5]:
variables = list(data.columns[:15])
# # Sklearn
# A wrapper for scikit-learn classifiers. In this example we use GradientBoosting with default settings.
# In[6]:
from rep.estimators import SklearnClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Using gradient boosting with default settings
sk = SklearnClassifier(GradientBoostingClassifier(), features=variables)
# Training classifier
sk.fit(train_data, train_labels)
print('training complete')
# ### Predicting probabilities, measuring the quality
# In[7]:
# predict probabilities for each class
prob = sk.predict_proba(test_data)
print(prob)
# In[8]:
print('ROC AUC', roc_auc_score(test_labels, prob[:, 1]))
# ### Predictions of classes
# In[9]:
sk.predict(test_data)
# In[10]:
sk.get_feature_importances()
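# ### Staged predictions
# The common interface also exposes `staged_predict_proba` (listed above). A minimal sketch of
# tracking the test ROC AUC after each boosting iteration of the `sk` classifier trained above:
# In[ ]:
# staged_predict_proba yields class probabilities after each iteration of the ensemble
staged_aucs = [roc_auc_score(test_labels, p[:, 1]) for p in sk.staged_predict_proba(test_data)]
print('ROC AUC after the first and the last iteration:', staged_aucs[0], staged_aucs[-1])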
# ## TMVA
# In[11]:
from rep.estimators import TMVAClassifier
print(TMVAClassifier.__doc__)
# In[12]:
tmva = TMVAClassifier(method='kBDT', NTrees=50, Shrinkage=0.05, features=variables)
tmva.fit(train_data, train_labels)
print('training complete')
# ### Predict probabilities and estimate quality
# In[13]:
# predict probabilities for each class
prob = tmva.predict_proba(test_data)
print(prob)
# In[14]:
print('ROC AUC', roc_auc_score(test_labels, prob[:, 1]))
# In[15]:
# predict labels
tmva.predict(test_data)
# ## XGBoost
# In[16]:
from rep.estimators import XGBoostClassifier
print(XGBoostClassifier.__doc__)
# In[17]:
# XGBoost with default parameters
xgb = XGBoostClassifier(features=variables)
xgb.fit(train_data, train_labels)
print('training complete')
# ### Predict probabilities and estimate quality
# In[18]:
prob = xgb.predict_proba(test_data)
print('ROC AUC:', roc_auc_score(test_labels, prob[:, 1]))
# ### Predict labels
# In[19]:
xgb.predict(test_data)
# In[20]:
xgb.get_feature_importances()
# # Advantages of common interface
# As one can see above, all the classifiers implement the same interface.
# This simplifies the work and makes it easy to compare different classifiers,
# but it is not the only benefit.
#
# `sklearn` provides various tools to combine classifiers and transformers.
# One of these tools is `AdaBoost`, a meta-classifier built on top of some other classifier (usually a decision tree). Bagging is another frequently used ensembling meta-algorithm (a bagging sketch follows the AdaBoost example below).
#
# Let's show that you can now run AdaBoost over classifiers from other libraries!
# _(isn't boosting over a neural network what you have been dreaming of all your life?)_
# ## AdaBoost over XGBoost
# In[21]:
from sklearn.ensemble import AdaBoostClassifier
# In[22]:
get_ipython().run_cell_magic('time', '', "base_xgb = XGBoostClassifier(n_estimators=20)\nada_xgb = SklearnClassifier(AdaBoostClassifier(base_estimator=base_xgb, n_estimators=5))\nada_xgb.fit(train_data[variables], train_labels)\nprint('training complete!')\n\n# predict probabilities for each class on the test data\nprob = ada_xgb.predict_proba(test_data[variables])\nprint('test AUC', roc_auc_score(test_labels, prob[:, 1]))\n\n# predict probabilities for each class on the train data\nprob = ada_xgb.predict_proba(train_data[variables])\nprint('train AUC', roc_auc_score(train_labels, prob[:, 1]))\n")
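# ## Bagging over XGBoost
#
# Bagging, mentioned above, combines with these classifiers in the same way. A minimal sketch using
# `sklearn.ensemble.BaggingClassifier` (the numbers of estimators below are illustrative, not tuned):
# In[ ]:
from sklearn.ensemble import BaggingClassifier
# each base XGBoost model is trained on a bootstrap sample of the training data
base_xgb = XGBoostClassifier(n_estimators=20)
bagging_xgb = SklearnClassifier(BaggingClassifier(base_estimator=base_xgb, n_estimators=5))
bagging_xgb.fit(train_data[variables], train_labels)
prob = bagging_xgb.predict_proba(test_data[variables])
print('AUC', roc_auc_score(test_labels, prob[:, 1]))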
# ## AdaBoost over TMVA classifier
#
# The following code shows that you can do the same with e.g. TMVA; uncomment it to try.
# In[23]:
# base_tmva = TMVAClassifier(method='kBDT', NTrees=20)
# ada_tmva = SklearnClassifier(AdaBoostClassifier(base_estimator=base_tmva, n_estimators=5), features=variables)
# ada_tmva.fit(train_data, train_labels)
# print('training complete')
# prob = ada_tmva.predict_proba(test_data)
# print('AUC', roc_auc_score(test_labels, prob[:, 1]))
# # Other advantages of common interface
# There are many things you can do with classifiers now:
# * cloning
# * getting / setting parameters as dictionaries
# * automatic hyperparameter optimization
# * building pipelines (`sklearn.pipeline`)
# * hierarchical training, training on subsets
# * passing over the internet / training classifiers on other machines
#
# And you can replace classifiers at any moment.
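#
# For instance, cloning and parameter handling go through the standard `sklearn` mechanisms.
# A minimal sketch using the `sk` classifier trained above (refitting on fewer features is just an illustration):
# In[ ]:
from sklearn.base import clone
# an untrained copy of the classifier with the same parameters
sk_copy = clone(sk)
# parameters are available as a plain dictionary and can be changed through the same interface
print(sk_copy.get_params())
sk_copy.set_params(features=variables[:5])
sk_copy.fit(train_data, train_labels)
print('ROC AUC', roc_auc_score(test_labels, sk_copy.predict_proba(test_data)[:, 1]))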
# ## Exercises
#
# Exercise 1. Play with the parameters of each type of classifier.
#
# Exercise 2. Add a weight column and train the models with these weights.