Naive scikit-learn example

In [1]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
In [2]:
# Loading Iris dataset
X, y = datasets.load_iris(return_X_y=True)
print(X.shape, y.shape)
(150, 4) (150,)
In [3]:
# Initializing a Random Forest with arbitrary hyperparameters
# max_depth kept as 2 since Iris has only 4 features
clf = RandomForestClassifier(n_estimators=10, max_depth=2)
In [4]:
scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print("Mean score : {:.5f}".format(scores.mean()))
Mean score : 0.94667

scikit-learn as OpenML components

In [5]:
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

Dataset component

In [6]:
# Loads the Iris dataset as a dataset module/component
def dataset():
    X, y = datasets.load_iris(return_X_y=True)
    return X, y

Task component

In [7]:
# Tasks here define the number of cross-validation folds
# and the scoring metric to be used for evaluation
def task_1(f):
    X, y = dataset()  # loads IRIS
    return cross_val_score(f, X, y, cv=5, scoring='accuracy')

def task_2(f):
    X, y = dataset()  # loads IRIS
    return cross_val_score(f, X, y, cv=15, scoring='balanced_accuracy')

Flow component

In [8]:
# Flows define the modelling technique to be applied
# and let a model be specified independently of any dataset or task
def flow_1():
    clf = RandomForestClassifier(n_estimators=10, max_depth=2)
    return clf

def flow_2():
    clf = SVC(gamma='auto', kernel='linear')
    return clf

Run component

In [9]:
# A run evaluates a task-flow pairing and therefore,
# in effect, executes the modelling of a dataset
# as per the task definition
def run(task, flow):
    return task(flow)
In [10]:
# Results for Random Forest
print("RF using task 1: {:<.5}; task 2: {:<.5}".format(run(task_1, flow_1()).mean(), run(task_2, flow_1()).mean()))
# Results for SVM
print("SVM using task 1: {:<.5}; task 2: {:<.5}".format(run(task_1, flow_2()).mean(), run(task_2, flow_2()).mean()))
RF using task 1: 0.96667; task 2: 0.95741
SVM using task 1: 0.98; task 2: 0.97222
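
Because flows are defined independently of datasets and tasks, adding a new modelling technique only means writing one more flow function and pairing it with the existing tasks through run(). A minimal sketch of such an extension (flow_3 and the choice of LogisticRegression are illustrative, not part of the components above):

# Hypothetical extra flow, reused with the existing tasks unchanged
from sklearn.linear_model import LogisticRegression

def flow_3():
    return LogisticRegression(max_iter=1000)

print("LogReg using task 1: {:<.5}; task 2: {:<.5}".format(
    run(task_1, flow_3()).mean(), run(task_2, flow_3()).mean()))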

Using OpenML

In [11]:
import openml

import numpy as np
import pandas as pd
In [12]:
# Fetching the list of all available datasets on OpenML
d = openml.datasets.list_datasets(output_format='dataframe')
print(d.shape)

# Listing column names or attributes that OpenML offers
for name in d.columns:
    print(name)
(2958, 16)
did
name
version
uploader
status
format
MajorityClassSize
MaxNominalAttDistinctValues
MinorityClassSize
NumberOfClasses
NumberOfFeatures
NumberOfInstances
NumberOfInstancesWithMissingValues
NumberOfMissingValues
NumberOfNumericFeatures
NumberOfSymbolicFeatures
In [13]:
print(d.head())
   did        name  version uploader  status format  MajorityClassSize  \
2    2      anneal        1        1  active   ARFF              684.0   
3    3    kr-vs-kp        1        1  active   ARFF             1669.0   
4    4       labor        1        1  active   ARFF               37.0   
5    5  arrhythmia        1        1  active   ARFF              245.0   
6    6      letter        1        1  active   ARFF              813.0   

   MaxNominalAttDistinctValues  MinorityClassSize  NumberOfClasses  \
2                          7.0                8.0              5.0   
3                          3.0             1527.0              2.0   
4                          3.0               20.0              2.0   
5                         13.0                2.0             13.0   
6                         26.0              734.0             26.0   

   NumberOfFeatures  NumberOfInstances  NumberOfInstancesWithMissingValues  \
2              39.0              898.0                               898.0   
3              37.0             3196.0                                 0.0   
4              17.0               57.0                                56.0   
5             280.0              452.0                               384.0   
6              17.0            20000.0                                 0.0   

   NumberOfMissingValues  NumberOfNumericFeatures  NumberOfSymbolicFeatures  
2                22175.0                      6.0                      33.0  
3                    0.0                      0.0                      37.0  
4                  326.0                      8.0                       9.0  
5                  408.0                    206.0                      74.0  
6                    0.0                     16.0                       1.0  
In [38]:
# Filtering the dataset list for names containing 'iris',
# then sorting the result by 'version'
d[d['name'].str.contains('iris')].sort_values(by='version').head()
Out[38]:
did name version uploader status format MajorityClassSize MaxNominalAttDistinctValues MinorityClassSize NumberOfClasses NumberOfFeatures NumberOfInstances NumberOfInstancesWithMissingValues NumberOfMissingValues NumberOfNumericFeatures NumberOfSymbolicFeatures
61 61 iris 1 1 active ARFF 50.0 3.0 50.0 3.0 5.0 150.0 0.0 0.0 4.0 1.0
41950 41950 iris_test_upload 1 4030 active ARFF 50.0 3.0 50.0 3.0 5.0 150.0 0.0 0.0 4.0 1.0
451 451 irish 1 2 active ARFF 278.0 10.0 222.0 2.0 6.0 500.0 32.0 32.0 2.0 4.0
969 969 iris 3 2 active ARFF 100.0 2.0 50.0 2.0 5.0 150.0 0.0 0.0 4.0 1.0
41510 41510 iris 9 348 active ARFF NaN 3.0 NaN NaN 5.0 150.0 0.0 0.0 4.0 1.0

Retrieving the Iris dataset from OpenML

In [15]:
iris = openml.datasets.get_dataset(61)
iris
Out[15]:
OpenML Dataset
==============
Name..........: iris
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:23:39
Licence.......: Public
Download URL..: https://www.openml.org/data/v1/download/61/iris.arff
OpenML URL....: https://www.openml.org/d/61
# of features.: 5
# of instances: 150
In [16]:
iris.features
Out[16]:
{0: [0 - sepallength (numeric)],
 1: [1 - sepalwidth (numeric)],
 2: [2 - petallength (numeric)],
 3: [3 - petalwidth (numeric)],
 4: [4 - class (nominal)]}
In [17]:
print(iris.description)
**Author**: R.A. Fisher  
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall  
**Please cite**:   

**Iris Plants Database**  
This is perhaps the best known database to be found in the pattern recognition literature.  Fisher's paper is a classic in the field and is referenced frequently to this day.  (See Duda & Hart, for example.)  The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.  One class is     linearly separable from the other 2; the latter are NOT linearly separable from each other.

Predicted attribute: class of iris plant.  
This is an exceedingly simple domain.  
 
### Attribute Information:
    1. sepal length in cm
    2. sepal width in cm
    3. petal length in cm
    4. petal width in cm
    5. class: 
       -- Iris Setosa
       -- Iris Versicolour
       -- Iris Virginica
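
The dataset object also gives access to the underlying data. A minimal sketch, assuming a recent openml-python in which get_data accepts a target and (in some versions) a dataset_format argument; the exact defaults vary by release:

# Sketch: fetching the data behind the OpenML dataset object
X, y, categorical_indicator, attribute_names = iris.get_data(
    target=iris.default_target_attribute, dataset_format='dataframe')
print(X.shape, y.shape)   # expected: (150, 4) (150,)
print(attribute_names)    # the four sepal/petal measurements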

Retrieving the OpenML tasks that used Iris

In [18]:
df = openml.tasks.list_tasks(data_id=61, output_format='dataframe')
df.head()
Out[18]:
tid ttid did name task_type status estimation_procedure evaluation_measures source_data target_feature ... NumberOfFeatures NumberOfInstances NumberOfInstancesWithMissingValues NumberOfMissingValues NumberOfNumericFeatures NumberOfSymbolicFeatures number_samples cost_matrix quality_measure target_value
59 59 1 61 iris Supervised Classification active 10-fold Crossvalidation predictive_accuracy 61 class ... 5 150 0 0 4 1 NaN NaN NaN NaN
118 118 3 61 iris Learning Curve active 10 times 10-fold Learning Curve predictive_accuracy 61 class ... 5 150 0 0 4 1 4 NaN NaN NaN
289 289 1 61 iris Supervised Classification active 33% Holdout set predictive_accuracy 61 class ... 5 150 0 0 4 1 NaN NaN NaN NaN
1758 1758 3 61 iris Learning Curve active 10-fold Learning Curve predictive_accuracy 61 class ... 5 150 0 0 4 1 4 NaN NaN NaN
1823 1823 1 61 iris Supervised Classification active 5 times 2-fold Crossvalidation predictive_accuracy 61 class ... 5 150 0 0 4 1 NaN NaN NaN NaN

5 rows × 24 columns

In [19]:
# Filtering only the Supervised Classification tasks on Iris
df.query("task_type=='Supervised Classification'").head()
Out[19]:
tid ttid did name task_type status estimation_procedure evaluation_measures source_data target_feature ... NumberOfFeatures NumberOfInstances NumberOfInstancesWithMissingValues NumberOfMissingValues NumberOfNumericFeatures NumberOfSymbolicFeatures number_samples cost_matrix quality_measure target_value
59 59 1 61 iris Supervised Classification active 10-fold Crossvalidation predictive_accuracy 61 class ... 5 150 0 0 4 1 NaN NaN NaN NaN
289 289 1 61 iris Supervised Classification active 33% Holdout set predictive_accuracy 61 class ... 5 150 0 0 4 1 NaN NaN NaN NaN
1823 1823 1 61 iris Supervised Classification active 5 times 2-fold Crossvalidation predictive_accuracy 61 class ... 5 150 0 0 4 1 NaN NaN NaN NaN
1939 1939 1 61 iris Supervised Classification active 10 times 10-fold Crossvalidation predictive_accuracy 61 class ... 5 150 0 0 4 1 NaN NaN NaN NaN
1992 1992 1 61 iris Supervised Classification active Leave one out predictive_accuracy 61 class ... 5 150 0 0 4 1 NaN NaN NaN NaN

5 rows × 24 columns

In [62]:
# Collecting all task_ids
tasks = df.query("task_type=='Supervised Classification'")['tid'].to_numpy()
print(len(tasks))
11
In [63]:
# Listing all evaluations made on the 11 tasks collected above,
# with 'predictive_accuracy' as the evaluation metric
task_df = openml.evaluations.list_evaluations(function='predictive_accuracy', task=tasks, output_format='dataframe')
task_df.head()
Out[63]:
run_id task_id setup_id flow_id flow_name data_id data_name function upload_time uploader uploader_name value values array_data
0 81 59 12 67 weka.BayesNet_K2(1) 61 iris predictive_accuracy 2014-04-07 00:05:11 1 [email protected] 0.940000 None None
1 161 59 13 70 weka.SMO_PolyKernel(1) 61 iris predictive_accuracy 2014-04-07 00:55:32 1 [email protected] 0.960000 None None
2 234 59 1 56 weka.ZeroR(1) 61 iris predictive_accuracy 2014-04-07 01:33:24 1 [email protected] 0.333333 None None
3 447 59 6 61 weka.REPTree(1) 61 iris predictive_accuracy 2014-04-07 06:26:27 1 [email protected] 0.926667 None None
4 473 59 18 77 weka.LogitBoost_DecisionStump(1) 61 iris predictive_accuracy 2014-04-07 06:39:27 1 [email protected] 0.946667 None None

Filtering tasks retrieved

In [64]:
# Filtering based on sklearn (scikit-learn) modules
task_df = task_df[task_df['flow_name'].str.contains("sklearn")]
task_df.head()
Out[64]:
run_id task_id setup_id flow_id flow_name data_id data_name function upload_time uploader uploader_name value values array_data
144 1849043 59 29015 5500 sklearn.ensemble.forest.RandomForestClassifier... 61 iris predictive_accuracy 2017-03-03 17:10:12 1 [email protected] 0.946667 None None
145 1853409 59 30950 5873 sklearn.pipeline.Pipeline(Imputer=openml.utils... 61 iris predictive_accuracy 2017-03-21 22:08:01 1 [email protected] 0.960000 None None
146 6130126 59 4163633 7108 sklearn.model_selection._search.RandomizedSear... 61 iris predictive_accuracy 2017-08-21 11:07:40 1 [email protected] 0.960000 None None
147 6130128 59 4163634 7108 sklearn.model_selection._search.RandomizedSear... 61 iris predictive_accuracy 2017-08-21 11:08:06 1 [email protected] 0.946667 None None
148 6715383 59 4747289 7117 sklearn.model_selection._search.RandomizedSear... 61 iris predictive_accuracy 2017-09-01 02:56:44 1 [email protected] 0.960000 None None
In [65]:
# Counting how often each task was used to solve Iris
# as a supervised classification problem with scikit-learn
task_df['task_id'].value_counts()
Out[65]:
59       1984
10107      25
289         1
Name: task_id, dtype: int64
In [67]:
# Retrieving the most used task
t = openml.tasks.get_task(59)
t
Out[67]:
OpenML Classification Task
==========================
Task Type Description: https://www.openml.org/tt/1
Task ID..............: 59
Task URL.............: https://www.openml.org/t/59
Estimation Procedure.: crossvalidation
Evaluation Measure...: predictive_accuracy
Target Feature.......: class
# of Classes.........: 3
Cost Matrix..........: Available
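
A task also bundles the data and the exact, server-defined cross-validation splits, which is what makes results comparable across users. A minimal sketch of retrieving them (method names as in openml-python; treat them as an assumption for your installed version):

# Sketch: pulling the data and one train/test split out of the task
X, y = t.get_X_and_y()
train_idx, test_idx = t.get_train_test_split_indices(repeat=0, fold=0)
print(X.shape, len(train_idx), len(test_idx))   # e.g. (150, 4) 135 15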
In [72]:
# Filtering for only task_id=59
task_df = task_df.query("task_id==59")
In [73]:
# Filtering for flows whose name contains Random Forest
task_rf = task_df[task_df['flow_name'].str.contains("RandomForest")]
task_rf.head()
Out[73]:
run_id task_id setup_id flow_id flow_name data_id data_name function upload_time uploader uploader_name value values array_data
144 1849043 59 29015 5500 sklearn.ensemble.forest.RandomForestClassifier... 61 iris predictive_accuracy 2017-03-03 17:10:12 1 [email protected] 0.946667 None None
145 1853409 59 30950 5873 sklearn.pipeline.Pipeline(Imputer=openml.utils... 61 iris predictive_accuracy 2017-03-21 22:08:01 1 [email protected] 0.960000 None None
146 6130126 59 4163633 7108 sklearn.model_selection._search.RandomizedSear... 61 iris predictive_accuracy 2017-08-21 11:07:40 1 [email protected] 0.960000 None None
147 6130128 59 4163634 7108 sklearn.model_selection._search.RandomizedSear... 61 iris predictive_accuracy 2017-08-21 11:08:06 1 [email protected] 0.946667 None None
190 6946499 59 4978397 7109 sklearn.pipeline.Pipeline(imputation=openmlstu... 61 iris predictive_accuracy 2017-09-02 22:06:32 1 [email protected] 0.920000 None None
In [74]:
task_rf.sort_values(by='value', ascending=False).head()
Out[74]:
run_id task_id setup_id flow_id flow_name data_id data_name function upload_time uploader uploader_name value values array_data
3549 523926 59 3526 2629 sklearn.ensemble.forest.RandomForestClassifier(8) 61 iris predictive_accuracy 2016-02-11 22:05:23 869 [email protected] 0.966667 None None
4353 8955370 59 6890988 7257 sklearn.ensemble.forest.RandomForestClassifier... 61 iris predictive_accuracy 2018-04-06 16:32:22 3964 [email protected] 0.960000 None None
3587 1852682 59 29263 5500 sklearn.ensemble.forest.RandomForestClassifier... 61 iris predictive_accuracy 2017-03-15 22:55:18 1022 [email protected] 0.960000 None None
4375 8886608 59 6835139 7961 sklearn.pipeline.Pipeline(Imputer=sklearn.prep... 61 iris predictive_accuracy 2018-03-17 16:46:27 5032 [email protected] 0.960000 None None
3107 1843272 59 24071 4830 sklearn.ensemble.forest.RandomForestClassifier... 61 iris predictive_accuracy 2016-12-08 20:10:03 2 [email protected] 0.960000 None None
In [75]:
# Fetching the Random Forest flow with the best score
f = openml.flows.get_flow(2629)
f
Out[75]:
OpenML Flow
===========
Flow ID.........: 2629 (version 8)
Flow URL........: https://www.openml.org/f/2629
Flow Name.......: sklearn.ensemble.forest.RandomForestClassifier
Flow Description: Flow generated by openml_run
Upload Date.....: 2016-02-11 21:17:08
Dependencies....: None
In [77]:
# Fetching the run with the best score for
# Random Forest on Iris
r = openml.runs.get_run(523926)
r
Out[77]:
OpenML Run
==========
Uploader Name...: Pieter Gijsbers
Uploader Profile: https://www.openml.org/u/869
Metric..........: predictive_accuracy
Result..........: 0.966667
Run ID..........: 523926
Run URL.........: https://www.openml.org/r/523926
Task ID.........: 59
Task Type.......: Supervised Classification
Task URL........: https://www.openml.org/t/59
Flow ID.........: 2629
Flow Name.......: sklearn.ensemble.forest.RandomForestClassifier(8)
Flow URL........: https://www.openml.org/f/2629
Setup ID........: 3526
Setup String....: None
Dataset ID......: 61
Dataset URL.....: https://www.openml.org/d/61
In [78]:
# The scoring metric used
t.evaluation_measure
Out[78]:
'predictive_accuracy'
In [79]:
# The estimation procedure used
t.estimation_procedure
Out[79]:
{'type': 'crossvalidation',
 'parameters': {'number_repeats': '1',
  'number_folds': '10',
  'percentage': '',
  'stratified_sampling': 'true'},
 'data_splits_url': 'https://www.openml.org/api_splits/get/59/Task_59_splits.arff'}
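
For comparison, this estimation procedure can be approximated locally with scikit-learn's StratifiedKFold; the official splits live at the data_splits_url above, so a local split will not reproduce the server evaluation exactly. A minimal, self-contained sketch:

# Sketch: approximating the task's 10-fold stratified CV locally
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

X_local, y_local = load_iris(return_X_y=True)
clf_local = RandomForestClassifier(n_estimators=10, max_depth=2)
cv = StratifiedKFold(n_splits=10)
print(cross_val_score(clf_local, X_local, y_local, cv=cv, scoring='accuracy').mean())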
In [80]:
# The model used
f.name
Out[80]:
'sklearn.ensemble.forest.RandomForestClassifier'
In [81]:
# The model parameters
for param in r.parameter_settings:
    name, value = param['oml:name'], param['oml:value']
    print("{:<25} : {:<10}".format(name, value))
warm_start                : False     
oob_score                 : False     
n_jobs                    : 1         
verbose                   : 0         
max_leaf_nodes            : None      
bootstrap                 : True      
min_samples_leaf          : 1         
n_estimators              : 10        
min_samples_split         : 2         
min_weight_fraction_leaf  : 0.0       
criterion                 : gini      
random_state              : None      
max_features              : auto      
max_depth                 : None      
class_weight              : None      
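
The same settings can be gathered into a plain dict, which is handy when rebuilding the model locally; note that OpenML returns the values as strings, so they may need casting. A small sketch based on the loop above:

# Sketch: collecting the run's hyperparameters into a dict (values are strings)
rf_params = {p['oml:name']: p['oml:value'] for p in r.parameter_settings}
print(rf_params['n_estimators'], rf_params['max_depth'])   # '10' 'None'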

Retrieving top results on task 59

In [85]:
# # Fetching top results
# df = openml.tasks.list_tasks(data_id=61, output_format='dataframe')
# tasks = df.query("task_type=='Supervised Classification'")['tid'].to_numpy()
# tdf = openml.evaluations.list_evaluations(function='predictive_accuracy', task=tasks, output_format='dataframe')
# tdf = tdf[tdf['flow_name'].str.contains("sklearn")]
# Sorting and displaying the top 5 results
task_df.sort_values(by='value', ascending=False).head()
Out[85]:
run_id task_id setup_id flow_id flow_name data_id data_name function upload_time uploader uploader_name value values array_data
3626 2012941 59 157624 6048 sklearn.pipeline.Pipeline(dualimputer=helper.d... 61 iris predictive_accuracy 2017-04-07 01:36:00 1104 [email protected] 0.986667 None None
3618 2012930 59 157613 6048 sklearn.pipeline.Pipeline(dualimputer=helper.d... 61 iris predictive_accuracy 2017-04-06 23:00:24 1104 [email protected] 0.986667 None None
3633 2083536 59 217067 6049 sklearn.svm.classes.NuSVC(1) 61 iris predictive_accuracy 2017-04-23 01:13:21 1104 [email protected] 0.986667 None None
3631 2039750 59 180924 6048 sklearn.pipeline.Pipeline(dualimputer=helper.d... 61 iris predictive_accuracy 2017-04-09 01:17:39 1104 [email protected] 0.986667 None None
3630 2039748 59 180922 6048 sklearn.pipeline.Pipeline(dualimputer=helper.d... 61 iris predictive_accuracy 2017-04-09 01:09:01 1104 [email protected] 0.986667 None None
In [86]:
# Fetching the best-performing flow
f = openml.flows.get_flow(6048)
f
Out[86]:
OpenML Flow
===========
Flow ID.........: 6048 (version 1)
Flow URL........: https://www.openml.org/f/6048
Flow Name.......: sklearn.pipeline.Pipeline(dualimputer=helper.dual_imputer.DualImputer,nusvc=sklearn.svm.classes.NuSVC)
Flow Description: Automatically created scikit-learn flow.
Upload Date.....: 2017-04-06 22:42:59
Dependencies....: sklearn==0.18.1
numpy>=1.6.1
scipy>=0.9
In [87]:
# Fetching the best-performing run
r = openml.runs.get_run(2012943)

# The model parameters
for param in r.parameter_settings:
    name, value = param['oml:name'], param['oml:value']
    print("{:<25} : {:<10}".format(name, value))
steps                     : [('DualImputer', <helper.dual_imputer.DualImputer object at 0x7ff618e4d908>), ('nusvc', NuSVC(cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, nu=0.3, probability=True, random_state=3, shrinking=True,
   tol=3.2419092644286417e-05, verbose=False))]
cache_size                : 200       
class_weight              : None      
coef0                     : 0.0       
decision_function_shape   : None      
degree                    : 3         
gamma                     : auto      
kernel                    : linear    
max_iter                  : -1        
nu                        : 0.3       
probability               : True      
random_state              : 3         
shrinking                 : True      
tol                       : 3.24190926443e-05
verbose                   : False     

Running the best found flow on the required task

In [88]:
import openml
import numpy as np
from sklearn.svm import NuSVC
In [89]:
# Building the NuSVC model object with the parameters found above
clf = NuSVC(cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, nu=0.3, probability=True, random_state=3, shrinking=True,
   tol=3.2419092644286417e-05, verbose=False)
In [90]:
# Viewing task used earlier
t
Out[90]:
OpenML Classification Task
==========================
Task Type Description: https://www.openml.org/tt/1
Task ID..............: 59
Task URL.............: https://www.openml.org/t/59
Estimation Procedure.: crossvalidation
Evaluation Measure...: predictive_accuracy
Target Feature.......: class
# of Classes.........: 3
Cost Matrix..........: Available
In [91]:
# Running the model on the task
# Internally, the model is converted into an OpenML flow,
# which we can choose to retrieve
r, f = openml.runs.run_model_on_task(model=clf, task=t, upload_flow=False, return_flow=True)
f
Out[91]:
OpenML Flow
===========
Flow Name.......: sklearn.svm.classes.NuSVC
Flow Description: Nu-Support Vector Classification.

Similar to SVC but uses a parameter to control the number of support
vectors.

The implementation is based on libsvm.
Dependencies....: sklearn==0.21.3
numpy>=1.6.1
scipy>=0.9
In [92]:
# To obtain the score (without uploading)
## r.publish() can be used to upload these results;
## this requires signing in to https://www.openml.org/
score = []
evaluations = r.fold_evaluations['predictive_accuracy'][0]
for key in evaluations:
    score.append(evaluations[key])
print(np.mean(score))
0.9866666666666667
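
To actually upload these results, an API key from an OpenML account has to be configured first; r.publish() then sends the run to the server. A minimal sketch (the key below is a placeholder, and publishing requires a live connection and an account):

# Sketch: publishing the run to OpenML
openml.config.apikey = 'YOUR_OPENML_API_KEY'   # placeholder; copy it from your OpenML profile page
r.publish()                                    # uploads the run to the server
print(r.run_id)                                # id assigned by the server once published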