!ls
BASIC_sklearn_fact_book.ipynb PRACTICE_feature_and_ensemble.ipynb data PRACTICE_greedy_ensemble.ipynb ensemble.py PRACTICE_greedy_ensemble_r1.ipynb ensemble.pyc README.md FEATURE_by_clustering.ipynb REGULARIZATION_sklearn_comparison.ipynb FEATURE_images.ipynb tmp pca_vs_ica.png X.pkl
import ensemble
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
*Testing Helper Function*
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
## TEST create json files
ensemble._new_json_file('tmp/tmp_data.json')
!cat tmp/tmp_data.json
{}
## TEST write/read json records
ensemble._write_json_record('tmp/tmp_data.json', {'data1': {'name': 'gooddata'}}, False)
print ensemble._read_json_record('tmp/tmp_data.json')
#ensemble.write_json_record('tmp/tmp_data.json', {'data1': {'name': 'gooddata'}}, False)
ensemble._write_json_record('tmp/tmp_data.json', {'data2': {'name': 'betterata'}}, False)
print ensemble._read_json_record('tmp/tmp_data.json')
{u'data1': {u'name': u'gooddata'}} {u'data1': {u'name': u'gooddata'}, u'data2': {u'name': u'betterata'}}
## TEST delete keys
ensemble._remove_json_record('tmp/tmp_data.json', ['data1'])
print ensemble._read_json_record('tmp/tmp_data.json')
{u'data2': {u'name': u'betterata'}}
print ensemble._get_path('tmp/tmp_data.json', 'data_json')
print ensemble._get_path('tmp/tmp_data.json', 'data_folder')
print ensemble._get_path('tmp/tmp_data.json', 'models_json')
print ensemble._get_path('tmp/tmp_data.json', 'models_folder')
/home/ce/workspace/tutorials/ml-tutorials/tmp/tmp_data.json/data.json /home/ce/workspace/tutorials/ml-tutorials/tmp/tmp_data.json/data /home/ce/workspace/tutorials/ml-tutorials/tmp/tmp_data.json/models.json /home/ce/workspace/tutorials/ml-tutorials/tmp/tmp_data.json/models
*Test ensemble data manipulation*
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
!rm -fR tmp/bigfat
ensemble.new_ensemble('bigfat', 'tmp/')
!ls tmp/bigfat
data data.json models models.json
## test write new data
import numpy as np
ensemble.write_data('tmp/bigfat/', 'X', np.array([1, 2, 3]), {'decription': 'simple array'})
!cat tmp/bigfat/data.json
{"X": {"decription": "simple array", "file": "/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl", "stored_files": ["/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl", "/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl_01.npy"]}}
## test load data
print ensemble.load_data('tmp/bigfat/', 'X')
({u'decription': u'simple array', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl', u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl', u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl_01.npy']}, array([1, 2, 3]))
## test remove data
!ls tmp/bigfat/data
ensemble.remove_data('tmp/bigfat/', 'X')
X.pkl X.pkl_01.npy
!ls tmp/bigfat/data
!cat tmp/bigfat/data.json
{}
Test Model manipulation
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
!rm -fR tmp/bigfat/
ensemble.new_ensemble('bigfat', 'tmp')
## test write model
from sklearn.svm import LinearSVC
svc = LinearSVC()
ensemble.write_model('tmp/bigfat/', 'svc', svc, {'train_data': None,
'validation_data': None,
'test_data': None})
!ls tmp/bigfat/models
svc.pkl
!cat tmp/bigfat/models.json
{"svc": {"stored_files": ["/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl"], "test_data": null, "validation_data": null, "file": "/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl", "train_data": null}}
## TEST load model
print ensemble.load_model('tmp/bigfat/', 'svc')
({u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl'], u'test_data': None, u'validation_data': None, u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl', u'train_data': None}, LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))
## test update model record
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
iris = load_iris()
X, y = iris.data, iris.target
train_X, validation_X, train_y, validation_y = train_test_split(X, y, test_size = 0.3)
ensemble.write_data('tmp/bigfat/', 'iris_train', (train_X, train_y), {'description':'iris train data'})
ensemble.write_data('tmp/bigfat/', 'iris_validation', (validation_X, validation_y), {'description':'iris validation data'})
print ensemble.load_model('tmp/bigfat/', 'svc')
ensemble.update_model_record('tmp/bigfat/', 'svc', {'train_data': 'iris_train', 'validation_data': 'iris_validation'})
print ensemble.load_model('tmp/bigfat/', 'svc')
({u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl'], u'test_data': None, u'validation_data': None, u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl', u'train_data': None}, LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0)) ({u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl'], u'test_data': None, u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl', u'train_data': u'iris_train'}, LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))
## TEST train model
from sklearn.metrics import classification_report
_, model_before = ensemble.load_model('tmp/bigfat/', 'svc')
ensemble.train_model('tmp/bigfat/', 'svc', 'train_data')
_, model_after = ensemble.load_model('tmp/bigfat/', 'svc')
try:
    # An unfitted model is expected to raise AttributeError on predict
    # (the transcript relies on this; presumably no coef_ before fit —
    # verify against the sklearn version in use).
    model_before.predict(X)
    raise RuntimeError('exceptions not caught')
except AttributeError as e:
    pass
classification_report(y, model_after.predict(X))
' precision recall f1-score support\n\n 0 1.00 1.00 1.00 50\n 1 0.94 0.96 0.95 50\n 2 0.96 0.94 0.95 50\n\navg / total 0.97 0.97 0.97 150\n'
model_name, (target, prediction) = ensemble.predict_model('tmp/bigfat/', 'svc',
'validation_data', probabilistic=False)
print target.shape
print prediction.shape
classification_report(target, prediction)
(45,) (45,)
' precision recall f1-score support\n\n 0 1.00 1.00 1.00 18\n 1 0.86 0.92 0.89 13\n 2 0.92 0.86 0.89 14\n\navg / total 0.93 0.93 0.93 45\n'
## test remove model
ensemble.remove_model('tmp/bigfat/', 'svc')
!ls tmp/bigfat/models/
remove svc from /home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models.json
!cat tmp/bigfat/models.json
{}
test parallel
reload(ensemble)
# Problem size for the parallel-primes benchmark.
N = 1000000   # search for primes in [2, N)
chunks = 400  # number of sub-ranges to farm out to the engines
def get_primes(rng):
    """Return the list of primes contained in the iterable *rng*.

    Uses trial division up to sqrt(n); elements are assumed >= 2
    (the callers in this notebook always start their ranges at 2).
    """
    # Hoisted out of is_prime: importing on every call was wasted work.
    import math

    def is_prime(n):
        # No divisor in [2, sqrt(n)] means n is prime.
        for i in range(2, int(math.sqrt(n)) + 1):
            if n % i == 0:
                return False
        return True

    return [i for i in rng if is_prime(i)]
# One kwargs dict per chunk; tasks are (callable, kwargs) pairs as
# consumed by ensemble._parallel.
rngs = [{'rng': rng} for rng in np.array_split(range(2, N), chunks)]
tasks = zip([get_primes] * len(rngs), rngs)
import numpy as np import time results = [] tic = time.time() for rng in rngs: results.append(get_primes(**rng)) print time.time()-tic, 'seconds' print len(list(flatten(results)))
tic = time.time() n = 0 n += len(get_primes(range(2, N/4))) n += len(get_primes(range(N/4, N2/4))) n += len(get_primes(range(N2/4, N3/4))) n += len(get_primes(range(N3/4, N))) print time.time() - tic, 'seconds' print n
%time print len(get_primes(range(2, N)))
from IPython import parallel client = parallel.Client() %time y = ensemble._parallel(tasks, client) import numpy as np len(list(flatten(y)))
dv = client[:] dv.block = True dv.scatter('rng', range(2, N))
%time y = ensemble._parallel([(get_primes, {'rng': parallel.Reference('rng')}) for _ in range(len(dv))], client) import numpy as np len(list(flatten(y)))
The culprit of SLOW PARALLEL COMPUTING turns out to be LARGE data transfer to the engines (presumably "transformation" meant transfer), so it is best to call load_data on the engines before running the tasks rather than shipping the data with each task
Test parallel model training and prediction
reload(ensemble)
ensemble_path = 'tmp/bigfat/'
from IPython import parallel
client = parallel.Client()
## create models
from sklearn import svm
from sklearn import linear_model
models = {
'linear_svc' : svm.LinearSVC()
, 'svc' : svm.SVC()
, 'sgd' : linear_model.SGDClassifier()
, 'lasso' : linear_model.Lasso()
, 'pac' : linear_model.PassiveAggressiveClassifier()
}
# Register every candidate model in the ensemble store, all sharing the
# same train/validation data keys written earlier.
for model_name, model in models.items():
    ensemble.write_model(ensemble_path, model_name,
                         model, {'train_data': 'iris_train',
                                 'validation_data': 'iris_validation'})
## TEST parallel prediction
try:
    # Predicting with models that were never trained must fail remotely
    # with AttributeError, surfaced by IPython.parallel as a RemoteError.
    results = ensemble.parallel_predict_model(
        ensemble_path,
        zip(models.keys(), ['validation_data'] * len(models), [False] * len(models)),
        client)
    raise RuntimeError('untrained models should NOT be able to predict')
except parallel.RemoteError as ex:
    if ex.ename == 'AttributeError':
        pass
    else:
        # Bare raise re-raises with the original traceback intact
        # (`raise ex` would reset it).
        raise
print ensemble._read_json_record(ensemble._get_path(ensemble_path, 'models_json'))
## TEST read model meta
print ''
print ensemble.read_model_meta(ensemble_path, 'lasso', ['file', 'is_probabilistic', 'train_data'])
{u'lasso': {u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/lasso.pkl'], u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/lasso.pkl', u'train_data': u'iris_train'}, u'pac': {u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/pac.pkl'], u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/pac.pkl', u'train_data': u'iris_train'}, u'sgd': {u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/sgd.pkl'], u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/sgd.pkl', u'train_data': u'iris_train'}, u'linear_svc': {u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/linear_svc.pkl'], u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/linear_svc.pkl', u'train_data': u'iris_train'}, u'svc': {u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl'], u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl', u'train_data': u'iris_train'}} {'is_probabilistic': None, 'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/lasso.pkl', 'train_data': u'iris_train'}
## TEST parallel training
# NOTE(review): 'paralle_train_models' (missing an 'l') looks like a typo,
# but the name must match what ensemble.py actually defines — verify there
# before renaming.
ensemble.paralle_train_models(ensemble_path,
zip(models.keys(), ['train_data'] * len(models)), client)
## TEST parallel prediction
results = ensemble.parallel_predict_model(ensemble_path,
zip(models.keys(), ['validation_data'] * len(models), [False]*len(models)),
client)
from pprint import pprint
pprint(results)
[['pac', (array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), array([1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1]))], ['lasso', (array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), array([ 1.27931693, 1.17098383, 1.17098383, 0.77015139, 1.0193175 , 0.80265131, 1.13848391, 0.75931808, 0.791818 , 1.18181714, 1.23598369, 0.791818 , 1.22515038, 0.75931808, 1.13848391, 1.17098383, 0.80265131, 0.77015139, 1.05181743, 0.791818 , 1.21431707, 1.07348405, 0.94348434, 1.03015081, 1.07348405, 0.791818 , 1.17098383, 1.17098383, 1.10598398, 1.31181686, 0.78098469, 1.08431736, 0.77015139, 0.82431793, 1.08431736, 0.80265131, 0.75931808, 0.77015139, 0.78098469, 1.09515067, 0.99765088, 0.78098469, 1.14931721, 1.22515038, 1.23598369]))], ['svc', (array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]))], ['linear_svc', (array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), array([2, 2, 2, 0, 1, 0, 2, 0, 0, 2, 2, 0, 1, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]))], ['sgd', (array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), array([2, 1, 2, 0, 1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]))]]
TEST GreedyEnsemble Class
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn=None, votefn=None)
models = ['lasso', 'pac', 'svc', 'sgd', 'linear_svc']
## test _predict_by_model
target, predictions = ge._predict_by_model(models, data_type='validation_data')
print predictions
print target.shape
{'pac': array([1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1]), 'sgd': array([2, 1, 2, 0, 1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), 'svc': array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), 'linear_svc': array([2, 2, 2, 0, 1, 0, 2, 0, 0, 2, 2, 0, 1, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), 'lasso': array([ 1.27931693, 1.17098383, 1.17098383, 0.77015139, 1.0193175 , 0.80265131, 1.13848391, 0.75931808, 0.791818 , 1.18181714, 1.23598369, 0.791818 , 1.22515038, 0.75931808, 1.13848391, 1.17098383, 0.80265131, 0.77015139, 1.05181743, 0.791818 , 1.21431707, 1.07348405, 0.94348434, 1.03015081, 1.07348405, 0.791818 , 1.17098383, 1.17098383, 1.10598398, 1.31181686, 0.78098469, 1.08431736, 0.77015139, 0.82431793, 1.08431736, 0.80265131, 0.75931808, 0.77015139, 0.78098469, 1.09515067, 0.99765088, 0.78098469, 1.14931721, 1.22515038, 1.23598369])} (45,)
## test _greedy_search
from sklearn import metrics
scorefn = metrics.accuracy_score
votefn = ensemble.GreedyEnsemble.vote_major_class
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn, votefn)
ge.fit(['sgd', 'svc', 'linear_svc'], data_type='validation_data', verbose = True)
print ge.ensemble_
checking model svc improvement from 0.0 to 0.977777777778 ['svc']
ge._predict_by_model(['sgd', 'svc', 'linear_svc'], data_type='validation_data')
(array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), {'linear_svc': array([2, 2, 2, 0, 1, 0, 2, 0, 0, 2, 2, 0, 1, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), 'sgd': array([2, 1, 2, 0, 1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), 'svc': array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2])})
## test partial_fit
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn, votefn)
models = ['sgd', 'svc', 'linear_svc', 'pac']
data_type = 'validation_data'
print ge.ensemble_
[]
# Grow the greedy ensemble one candidate model at a time.
for model in models:
    ge.partial_fit([model], data_type=data_type, verbose=True)
checking model sgd improvement from 0.0 to 0.844444444444 checking model svc improvement from 0.844444444444 to 0.844444444444 checking model linear_svc improvement from 0.844444444444 to 0.955555555556
print ge.ensemble_
['sgd', 'svc', 'linear_svc']
## test predict
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn, votefn)
models = ['sgd', 'svc', 'linear_svc', 'pac']
data_type = 'validation_data'
for model in models:
ge.partial_fit([model], data_type=data_type, verbose=True)
print ge.predict(data_type)
checking model sgd improvement from 0.0 to 0.844444444444 [2 1 2 0 1 0 1 0 0 1 2 0 1 0 1 1 0 0 1 0 1 1 1 1 1 0 2 2 1 1 0 1 0 0 1 0 0 0 0 1 1 0 1 2 2] checking model svc improvement from 0.844444444444 to 0.844444444444 [2 1 2 0 1 0 1 0 0 1 2 0 1 0 1 1 0 0 1 0 1 1 1 1 1 0 2 2 1 1 0 1 0 0 1 0 0 0 0 1 1 0 1 2 2] checking model linear_svc improvement from 0.844444444444 to 0.955555555556 [2 2 2 0 1 0 1 0 0 2 2 0 1 0 2 1 0 0 1 0 2 1 1 1 1 0 2 2 1 2 0 1 0 0 1 0 0 0 0 1 1 0 1 2 2] [2 2 2 0 1 0 1 0 0 2 2 0 1 0 2 1 0 0 1 0 2 1 1 1 1 0 2 2 1 2 0 1 0 0 1 0 0 0 0 1 1 0 1 2 2]
## test score
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn, votefn)
models = ['sgd', 'svc', 'linear_svc', 'pac']
data_type = 'validation_data'
for model in models:
ge.partial_fit([model], data_type=data_type, verbose=True)
print ge.score(data_type)
print ge.ensemble_
checking model sgd improvement from 0.0 to 0.844444444444 0.844444444444 checking model svc improvement from 0.844444444444 to 0.844444444444 0.844444444444 checking model linear_svc improvement from 0.844444444444 to 0.955555555556 0.955555555556 0.955555555556 ['sgd', 'svc', 'linear_svc']
Putting it all together
## TEST ensemble with digits and other data
!ls data
blackbox_ensemble blackbox.pkl digits.pkl icml2013-blackbox MNIST tmp
import cPickle
blackbox = cPickle.load(open('data/digits.pkl', 'rb'))
X, y = blackbox
print X.shape, y.shape
(42000, 784) (42000,)
from sklearn.cross_validation import train_test_split
train_X, validation_X, train_y, validation_y = train_test_split(X, y, test_size = 0.2)
print train_X.shape, validation_X.shape
print train_y.shape, validation_y.shape
(33600, 784) (8400, 784) (33600,) (8400,)
## make new ensemble
reload(ensemble)