#!/usr/bin/env python
# coding: utf-8

# # About
#
# This notebook demonstrates the classifiers provided by the __Reproducible experiment platform (REP)__ package.
# REP contains the following classifiers
# * __scikit-learn__
# * __TMVA__
# * __XGBoost__
# * estimators from __hep_ml__
# * __theanets__
# * __PyBrain__
# * __Neurolab__
#
# (and any `sklearn`-compatible classifier may be used).
#
# Neural network libraries are introduced in a different notebook.
#
# ### In this notebook we show the simplest way to
# * train a classifier
# * build predictions
# * measure quality
# * combine classifiers into metaclassifiers

# # Loading data

# ### download the particle identification Data Set from UCI

# In[1]:

get_ipython().system('cd toy_datasets; wget -O MiniBooNE_PID.txt -nc --no-check-certificate https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt')


# In[2]:

import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score

data = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=r'\s*', skiprows=[0], header=None, engine='python')
labels = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=' ', nrows=1, header=None)
# the first line of the file contains the numbers of signal and background events
labels = [1] * labels[1].values[0] + [0] * labels[2].values[0]
data.columns = ['feature_{}'.format(key) for key in data.columns]


# ### First rows of our data

# In[3]:

data[:5]


# ### Splitting into train and test

# In[4]:

# Get train and test data
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.25)


# # Classifiers

# All classifiers inherit from __sklearn.BaseEstimator__ and have the following methods:
#
# * `classifier.fit(X, y, sample_weight=None)` - train the classifier
#
# * `classifier.predict_proba(X)` - return a matrix of probabilities for all classes
#
# * `classifier.predict(X)` - return predicted labels
#
# * `classifier.staged_predict_proba(X)` - return probabilities after each iteration (not supported by TMVA)
#
# * `classifier.get_feature_importances()`
#
# Here `X` denotes a matrix with data of shape `[n_samples, n_features]`, `y` is a vector of labels (0 or 1) of shape `[n_samples]`,
# and `sample_weight` is a vector of per-event weights. A small, commented-out sketch of this interface in action is shown right below.
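# The next cell is a hedged illustration (not part of the original walkthrough) of the shared interface,
# in particular `staged_predict_proba`, which the examples further down do not use. It assumes a
# boosting-based classifier wrapped in `SklearnClassifier` (introduced below) and is left commented out.

# In[ ]:

# from rep.estimators import SklearnClassifier
# from sklearn.ensemble import GradientBoostingClassifier
#
# demo_clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=30))
# demo_clf.fit(train_data, train_labels)
# # track ROC AUC after each boosting iteration
# for stage, proba in enumerate(demo_clf.staged_predict_proba(test_data)):
#     print(stage, roc_auc_score(test_labels, proba[:, 1]))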
# ## Difference from the default scikit-learn interface
# `X` should be* a `pandas.DataFrame`, not a `numpy.array`.
# Provided this, you'll be able to choose the features used in training by setting e.g. `features=['FlightTime', 'p']` in the constructor
# (a commented-out illustration follows below).
#
# \* it works fine with `numpy.array` as well, but in that case all features will be used.
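# A hedged sketch of what this means in practice (an added illustration, left commented out;
# the `feature_0`/`feature_1` names refer to the columns created above):

# In[ ]:

# from rep.estimators import SklearnClassifier
# from sklearn.ensemble import GradientBoostingClassifier
#
# # only these two columns of the DataFrame will be used in training
# clf_two_features = SklearnClassifier(GradientBoostingClassifier(),
#                                      features=['feature_0', 'feature_1'])
# clf_two_features.fit(train_data, train_labels)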
# # Variables used in training

# In[5]:

variables = list(data.columns[:15])


# # Sklearn
# A wrapper for scikit-learn classifiers. In this example we use GradientBoosting with default settings.

# In[6]:

from rep.estimators import SklearnClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Using gradient boosting with default settings
sk = SklearnClassifier(GradientBoostingClassifier(), features=variables)
# Training classifier
sk.fit(train_data, train_labels)
print('training complete')


# ### Predicting probabilities, measuring the quality

# In[7]:

# predict probabilities for each class
prob = sk.predict_proba(test_data)
print(prob)


# In[8]:

print('ROC AUC', roc_auc_score(test_labels, prob[:, 1]))


# ### Predictions of classes

# In[9]:

sk.predict(test_data)


# In[10]:

sk.get_feature_importances()


# ## TMVA

# In[11]:

from rep.estimators import TMVAClassifier
print(TMVAClassifier.__doc__)


# In[12]:

tmva = TMVAClassifier(method='kBDT', NTrees=50, Shrinkage=0.05, features=variables)
tmva.fit(train_data, train_labels)
print('training complete')


# ### Predict probabilities and estimate quality

# In[13]:

# predict probabilities for each class
prob = tmva.predict_proba(test_data)
print(prob)


# In[14]:

print('ROC AUC', roc_auc_score(test_labels, prob[:, 1]))


# In[15]:

# predict labels
tmva.predict(test_data)


# ## XGBoost

# In[16]:

from rep.estimators import XGBoostClassifier
print(XGBoostClassifier.__doc__)


# In[17]:

# XGBoost with default parameters
xgb = XGBoostClassifier(features=variables)
xgb.fit(train_data, train_labels)
print('training complete')


# ### Predict probabilities and estimate quality

# In[18]:

prob = xgb.predict_proba(test_data)
print('ROC AUC:', roc_auc_score(test_labels, prob[:, 1]))


# ### Predict labels

# In[19]:

xgb.predict(test_data)


# In[20]:

xgb.get_feature_importances()


# # Advantages of the common interface
# As seen above, all the classifiers implement the same interface.
# This simplifies everyday work and the comparison of different classifiers, but it is not the only benefit.
#
# `Sklearn` provides various tools to combine classifiers and transformers.
# One of these tools is `AdaBoost`, a metaclassifier built on top of some base classifier (usually a decision tree).
# Bagging is another frequently used ensembling meta-algorithm.
#
# Let's show that you can now run AdaBoost over classifiers from other libraries!
# _(isn't boosting over a neural network what you have been dreaming of all your life?)_

# ## AdaBoost over XGBoost

# In[21]:

from sklearn.ensemble import AdaBoostClassifier


# In[22]:

get_ipython().run_cell_magic('time', '', "base_xgb = XGBoostClassifier(n_estimators=20)\nada_xgb = SklearnClassifier(AdaBoostClassifier(base_estimator=base_xgb, n_estimators=5))\nada_xgb.fit(train_data[variables], train_labels)\nprint('training complete!')\n\n# predict probabilities for each class on the test set\nprob = ada_xgb.predict_proba(test_data[variables])\nprint('AUC', roc_auc_score(test_labels, prob[:, 1]))\n\n# predict probabilities for each class on the train set\nprob = ada_xgb.predict_proba(train_data[variables])\nprint('AUC', roc_auc_score(train_labels, prob[:, 1]))\n")


# ## AdaBoost over a TMVA classifier
#
# the following code shows that you can do the same with e.g. TMVA; uncomment it to try

# In[23]:

# base_tmva = TMVAClassifier(method='kBDT', NTrees=20)
# ada_tmva = SklearnClassifier(AdaBoostClassifier(base_estimator=base_tmva, n_estimators=5), features=variables)
# ada_tmva.fit(train_data, train_labels)
# print('training complete')
# prob = ada_tmva.predict_proba(test_data)
# print('AUC', roc_auc_score(test_labels, prob[:, 1]))


# # Other advantages of the common interface
# There are many more things you can do with classifiers now:
# * cloning
# * getting / setting parameters as dictionaries
# * automatic hyperparameter optimization
# * building pipelines (`sklearn.pipeline`)
# * hierarchical training and training on subsets
# * passing over the network / training classifiers on other machines
#
# And you can replace classifiers at any moment. A hedged sketch of a few of these points follows below.
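# The next cell is an added, commented-out sketch (not from the original notebook) of cloning,
# parameters as dictionaries, and an `sklearn.pipeline`. It uses only standard scikit-learn utilities
# (`sklearn.base.clone`, `get_params`/`set_params`, `Pipeline`) on top of the `sk` classifier trained above;
# the nested parameter name `clf__n_estimators` is an assumption about how the wrapped estimator is exposed.

# In[ ]:

# from sklearn.base import clone
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
#
# # cloning: an untrained copy with the same parameters
# sk_clone = clone(sk)
#
# # getting / setting parameters as dictionaries
# print(sk_clone.get_params())
# sk_clone.set_params(clf__n_estimators=50)   # assumed name of the nested GradientBoosting parameter
#
# # a pipeline: scaling followed by a REP-wrapped classifier
# # (no `features` set, so all columns passed to the pipeline are used)
# pipeline = Pipeline([('scaler', StandardScaler()),
#                      ('gb', SklearnClassifier(GradientBoostingClassifier()))])
# pipeline.fit(train_data[variables], train_labels)
# print('ROC AUC', roc_auc_score(test_labels, pipeline.predict_proba(test_data[variables])[:, 1]))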
# ## Exercises
#
# Exercise 1. Play with the parameters of each type of classifier.
#
# Exercise 2. Add a weight column and train the models with per-event weights (a possible starting point is sketched below).
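# As a hedged starting point for exercise 2 (an added illustration, not a solution from the original
# notebook): the common interface accepts `sample_weight` in `fit`, so illustrative random weights
# could be generated and passed like this (left commented out).

# In[ ]:

# # generate some illustrative per-event weights (here: uniform random weights)
# weights = numpy.random.uniform(0.5, 1.5, size=len(train_labels))
#
# weighted_sk = SklearnClassifier(GradientBoostingClassifier(), features=variables)
# weighted_sk.fit(train_data, train_labels, sample_weight=weights)
# prob = weighted_sk.predict_proba(test_data)
# print('ROC AUC with weights', roc_auc_score(test_labels, prob[:, 1]))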