!ls
BASIC_sklearn_fact_book.ipynb PRACTICE_feature_and_ensemble.ipynb data PRACTICE_greedy_ensemble.ipynb ensemble.py PRACTICE_greedy_ensemble_r1.ipynb ensemble.pyc README.md FEATURE_by_clustering.ipynb REGULARIZATION_sklearn_comparison.ipynb FEATURE_images.ipynb tmp pca_vs_ica.png X.pkl
import ensemble
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
*Testing Helper Function*
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
## TEST create json files
ensemble._new_json_file('tmp/tmp_data.json')
!cat tmp/tmp_data.json
{}
## TEST write/read json records
ensemble._write_json_record('tmp/tmp_data.json', {'data1': {'name': 'gooddata'}}, False)
print ensemble._read_json_record('tmp/tmp_data.json')
#ensemble.write_json_record('tmp/tmp_data.json', {'data1': {'name': 'gooddata'}}, False)
ensemble._write_json_record('tmp/tmp_data.json', {'data2': {'name': 'betterata'}}, False)
print ensemble._read_json_record('tmp/tmp_data.json')
{u'data1': {u'name': u'gooddata'}} {u'data1': {u'name': u'gooddata'}, u'data2': {u'name': u'betterata'}}
## TEST delete keys
ensemble._remove_json_record('tmp/tmp_data.json', ['data1'])
print ensemble._read_json_record('tmp/tmp_data.json')
{u'data2': {u'name': u'betterata'}}
print ensemble._get_path('tmp/tmp_data.json', 'data_json')
print ensemble._get_path('tmp/tmp_data.json', 'data_folder')
print ensemble._get_path('tmp/tmp_data.json', 'models_json')
print ensemble._get_path('tmp/tmp_data.json', 'models_folder')
/home/ce/workspace/tutorials/ml-tutorials/tmp/tmp_data.json/data.json /home/ce/workspace/tutorials/ml-tutorials/tmp/tmp_data.json/data /home/ce/workspace/tutorials/ml-tutorials/tmp/tmp_data.json/models.json /home/ce/workspace/tutorials/ml-tutorials/tmp/tmp_data.json/models
*Test ensemble data manipulation*
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
!rm -fR tmp/bigfat
ensemble.new_ensemble('bigfat', 'tmp/')
!ls tmp/bigfat
data data.json models models.json
## test write new data
import numpy as np
ensemble.write_data('tmp/bigfat/', 'X', np.array([1, 2, 3]), {'decription': 'simple array'})
!cat tmp/bigfat/data.json
{"X": {"decription": "simple array", "file": "/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl", "stored_files": ["/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl", "/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl_01.npy"]}}
## test load data
print ensemble.load_data('tmp/bigfat/', 'X')
({u'decription': u'simple array', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl', u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl', u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/data/X.pkl_01.npy']}, array([1, 2, 3]))
## test remove data
!ls tmp/bigfat/data
ensemble.remove_data('tmp/bigfat/', 'X')
X.pkl X.pkl_01.npy
!ls tmp/bigfat/data
!cat tmp/bigfat/data.json
{}
Test Model manipulation
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
!rm -fR tmp/bigfat/
ensemble.new_ensemble('bigfat', 'tmp')
## test write model
from sklearn.svm import LinearSVC
svc = LinearSVC()
ensemble.write_model('tmp/bigfat/', 'svc', svc, {'train_data': None,
'validation_data': None,
'test_data': None})
!ls tmp/bigfat/models
svc.pkl
!cat tmp/bigfat/models.json
{"svc": {"stored_files": ["/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl"], "test_data": null, "validation_data": null, "file": "/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl", "train_data": null}}
## TEST load model
print ensemble.load_model('tmp/bigfat/', 'svc')
({u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl'], u'test_data': None, u'validation_data': None, u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl', u'train_data': None}, LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))
## test update model record
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
iris = load_iris()
X, y = iris.data, iris.target
train_X, validation_X, train_y, validation_y = train_test_split(X, y, test_size = 0.3)
ensemble.write_data('tmp/bigfat/', 'iris_train', (train_X, train_y), {'description':'iris train data'})
ensemble.write_data('tmp/bigfat/', 'iris_validation', (validation_X, validation_y), {'description':'iris validation data'})
print ensemble.load_model('tmp/bigfat/', 'svc')
ensemble.update_model_record('tmp/bigfat/', 'svc', {'train_data': 'iris_train', 'validation_data': 'iris_validation'})
print ensemble.load_model('tmp/bigfat/', 'svc')
({u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl'], u'test_data': None, u'validation_data': None, u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl', u'train_data': None}, LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0)) ({u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl'], u'test_data': None, u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl', u'train_data': u'iris_train'}, LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))
## TEST train model
from sklearn.metrics import classification_report
_, model_before = ensemble.load_model('tmp/bigfat/', 'svc')
ensemble.train_model('tmp/bigfat/', 'svc', 'train_data')
_, model_after = ensemble.load_model('tmp/bigfat/', 'svc')
try:
    # An unfitted model is expected to raise AttributeError on predict
    # (the transcript relies on this; presumably no coef_ before fit —
    # verify against the sklearn version in use).
    model_before.predict(X)
    raise RuntimeError('exceptions not caught')
except AttributeError as e:
    pass
classification_report(y, model_after.predict(X))
' precision recall f1-score support\n\n 0 1.00 1.00 1.00 50\n 1 0.94 0.96 0.95 50\n 2 0.96 0.94 0.95 50\n\navg / total 0.97 0.97 0.97 150\n'
model_name, (target, prediction) = ensemble.predict_model('tmp/bigfat/', 'svc',
'validation_data', probabilistic=False)
print target.shape
print prediction.shape
classification_report(target, prediction)
(45,) (45,)
' precision recall f1-score support\n\n 0 1.00 1.00 1.00 18\n 1 0.86 0.92 0.89 13\n 2 0.92 0.86 0.89 14\n\navg / total 0.93 0.93 0.93 45\n'
## test remove model
ensemble.remove_model('tmp/bigfat/', 'svc')
!ls tmp/bigfat/models/
remove svc from /home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models.json
!cat tmp/bigfat/models.json
{}
test parallel
reload(ensemble)
# Problem size for the parallel-primes benchmark.
N = 1000000   # search for primes in [2, N)
chunks = 400  # number of sub-ranges to farm out to the engines
def get_primes(rng):
    """Return the list of primes contained in the iterable *rng*.

    Uses trial division up to sqrt(n); elements are assumed >= 2
    (the callers in this notebook always start their ranges at 2).
    """
    # Hoisted out of is_prime: importing on every call was wasted work.
    import math

    def is_prime(n):
        # No divisor in [2, sqrt(n)] means n is prime.
        for i in range(2, int(math.sqrt(n)) + 1):
            if n % i == 0:
                return False
        return True

    return [i for i in rng if is_prime(i)]
# One kwargs dict per chunk; tasks are (callable, kwargs) pairs as
# consumed by ensemble._parallel.
rngs = [{'rng': rng} for rng in np.array_split(range(2, N), chunks)]
tasks = zip([get_primes] * len(rngs), rngs)
import numpy as np import time results = [] tic = time.time() for rng in rngs: results.append(get_primes(**rng)) print time.time()-tic, 'seconds' print len(list(flatten(results)))
tic = time.time() n = 0 n += len(get_primes(range(2, N/4))) n += len(get_primes(range(N/4, N2/4))) n += len(get_primes(range(N2/4, N3/4))) n += len(get_primes(range(N3/4, N))) print time.time() - tic, 'seconds' print n
%time print len(get_primes(range(2, N)))
from IPython import parallel client = parallel.Client() %time y = ensemble._parallel(tasks, client) import numpy as np len(list(flatten(y)))
dv = client[:] dv.block = True dv.scatter('rng', range(2, N))
%time y = ensemble._parallel([(get_primes, {'rng': parallel.Reference('rng')}) for _ in range(len(dv))], client) import numpy as np len(list(flatten(y)))
The culprit of SLOW PARALLEL COMPUTING turns out to be LARGE data transfer to the engines (presumably "transformation" meant transfer), so it is best to call load_data on the engines before running the tasks rather than shipping the data with each task
Test parallel model training and prediction
reload(ensemble)
ensemble_path = 'tmp/bigfat/'
from IPython import parallel
client = parallel.Client()
## create models
from sklearn import svm
from sklearn import linear_model
models = {
'linear_svc' : svm.LinearSVC()
, 'svc' : svm.SVC()
, 'sgd' : linear_model.SGDClassifier()
, 'lasso' : linear_model.Lasso()
, 'pac' : linear_model.PassiveAggressiveClassifier()
}
# Register every candidate model in the ensemble store, all sharing the
# same train/validation data keys written earlier.
for model_name, model in models.items():
    ensemble.write_model(ensemble_path, model_name,
                         model, {'train_data': 'iris_train',
                                 'validation_data': 'iris_validation'})
## TEST parallel prediction
try:
    # Predicting with models that were never trained must fail remotely
    # with AttributeError, surfaced by IPython.parallel as a RemoteError.
    results = ensemble.parallel_predict_model(
        ensemble_path,
        zip(models.keys(), ['validation_data'] * len(models), [False] * len(models)),
        client)
    raise RuntimeError('untrained models should NOT be able to predict')
except parallel.RemoteError as ex:
    if ex.ename == 'AttributeError':
        pass
    else:
        # Bare raise re-raises with the original traceback intact
        # (`raise ex` would reset it).
        raise
print ensemble._read_json_record(ensemble._get_path(ensemble_path, 'models_json'))
## TEST read model meta
print ''
print ensemble.read_model_meta(ensemble_path, 'lasso', ['file', 'is_probabilistic', 'train_data'])
{u'lasso': {u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/lasso.pkl'], u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/lasso.pkl', u'train_data': u'iris_train'}, u'pac': {u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/pac.pkl'], u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/pac.pkl', u'train_data': u'iris_train'}, u'sgd': {u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/sgd.pkl'], u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/sgd.pkl', u'train_data': u'iris_train'}, u'linear_svc': {u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/linear_svc.pkl'], u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/linear_svc.pkl', u'train_data': u'iris_train'}, u'svc': {u'stored_files': [u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl'], u'validation_data': u'iris_validation', u'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/svc.pkl', u'train_data': u'iris_train'}} {'is_probabilistic': None, 'file': u'/home/ce/workspace/tutorials/ml-tutorials/tmp/bigfat/models/lasso.pkl', 'train_data': u'iris_train'}
## TEST parallel training
# NOTE(review): 'paralle_train_models' (missing an 'l') looks like a typo,
# but the name must match what ensemble.py actually defines — verify there
# before renaming.
ensemble.paralle_train_models(ensemble_path,
zip(models.keys(), ['train_data'] * len(models)), client)
## TEST parallel prediction
results = ensemble.parallel_predict_model(ensemble_path,
zip(models.keys(), ['validation_data'] * len(models), [False]*len(models)),
client)
from pprint import pprint
pprint(results)
[['pac', (array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), array([1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1]))], ['lasso', (array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), array([ 1.27931693, 1.17098383, 1.17098383, 0.77015139, 1.0193175 , 0.80265131, 1.13848391, 0.75931808, 0.791818 , 1.18181714, 1.23598369, 0.791818 , 1.22515038, 0.75931808, 1.13848391, 1.17098383, 0.80265131, 0.77015139, 1.05181743, 0.791818 , 1.21431707, 1.07348405, 0.94348434, 1.03015081, 1.07348405, 0.791818 , 1.17098383, 1.17098383, 1.10598398, 1.31181686, 0.78098469, 1.08431736, 0.77015139, 0.82431793, 1.08431736, 0.80265131, 0.75931808, 0.77015139, 0.78098469, 1.09515067, 0.99765088, 0.78098469, 1.14931721, 1.22515038, 1.23598369]))], ['svc', (array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]))], ['linear_svc', (array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), array([2, 2, 2, 0, 1, 0, 2, 0, 0, 2, 2, 0, 1, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]))], ['sgd', (array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), array([2, 1, 2, 0, 1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]))]]
TEST GreedyEnsemble Class
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn=None, votefn=None)
models = ['lasso', 'pac', 'svc', 'sgd', 'linear_svc']
## test _predict_by_model
target, predictions = ge._predict_by_model(models, data_type='validation_data')
print predictions
print target.shape
{'pac': array([1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1]), 'sgd': array([2, 1, 2, 0, 1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), 'svc': array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), 'linear_svc': array([2, 2, 2, 0, 1, 0, 2, 0, 0, 2, 2, 0, 1, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), 'lasso': array([ 1.27931693, 1.17098383, 1.17098383, 0.77015139, 1.0193175 , 0.80265131, 1.13848391, 0.75931808, 0.791818 , 1.18181714, 1.23598369, 0.791818 , 1.22515038, 0.75931808, 1.13848391, 1.17098383, 0.80265131, 0.77015139, 1.05181743, 0.791818 , 1.21431707, 1.07348405, 0.94348434, 1.03015081, 1.07348405, 0.791818 , 1.17098383, 1.17098383, 1.10598398, 1.31181686, 0.78098469, 1.08431736, 0.77015139, 0.82431793, 1.08431736, 0.80265131, 0.75931808, 0.77015139, 0.78098469, 1.09515067, 0.99765088, 0.78098469, 1.14931721, 1.22515038, 1.23598369])} (45,)
## test _greedy_search
from sklearn import metrics
scorefn = metrics.accuracy_score
votefn = ensemble.GreedyEnsemble.vote_major_class
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn, votefn)
ge.fit(['sgd', 'svc', 'linear_svc'], data_type='validation_data', verbose = True)
print ge.ensemble_
checking model svc improvement from 0.0 to 0.977777777778 ['svc']
ge._predict_by_model(['sgd', 'svc', 'linear_svc'], data_type='validation_data')
(array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), {'linear_svc': array([2, 2, 2, 0, 1, 0, 2, 0, 0, 2, 2, 0, 1, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), 'sgd': array([2, 1, 2, 0, 1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2]), 'svc': array([2, 2, 2, 0, 1, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 1, 0, 0, 1, 0, 2, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2])})
## test partial_fit
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn, votefn)
models = ['sgd', 'svc', 'linear_svc', 'pac']
data_type = 'validation_data'
print ge.ensemble_
[]
# Grow the greedy ensemble one candidate model at a time.
for model in models:
    ge.partial_fit([model], data_type=data_type, verbose=True)
checking model sgd improvement from 0.0 to 0.844444444444 checking model svc improvement from 0.844444444444 to 0.844444444444 checking model linear_svc improvement from 0.844444444444 to 0.955555555556
print ge.ensemble_
['sgd', 'svc', 'linear_svc']
## test predict
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn, votefn)
models = ['sgd', 'svc', 'linear_svc', 'pac']
data_type = 'validation_data'
for model in models:
ge.partial_fit([model], data_type=data_type, verbose=True)
print ge.predict(data_type)
checking model sgd improvement from 0.0 to 0.844444444444 [2 1 2 0 1 0 1 0 0 1 2 0 1 0 1 1 0 0 1 0 1 1 1 1 1 0 2 2 1 1 0 1 0 0 1 0 0 0 0 1 1 0 1 2 2] checking model svc improvement from 0.844444444444 to 0.844444444444 [2 1 2 0 1 0 1 0 0 1 2 0 1 0 1 1 0 0 1 0 1 1 1 1 1 0 2 2 1 1 0 1 0 0 1 0 0 0 0 1 1 0 1 2 2] checking model linear_svc improvement from 0.844444444444 to 0.955555555556 [2 2 2 0 1 0 1 0 0 2 2 0 1 0 2 1 0 0 1 0 2 1 1 1 1 0 2 2 1 2 0 1 0 0 1 0 0 0 0 1 1 0 1 2 2] [2 2 2 0 1 0 1 0 0 2 2 0 1 0 2 1 0 0 1 0 2 1 1 1 1 0 2 2 1 2 0 1 0 0 1 0 0 0 0 1 1 0 1 2 2]
## test score
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn, votefn)
models = ['sgd', 'svc', 'linear_svc', 'pac']
data_type = 'validation_data'
for model in models:
ge.partial_fit([model], data_type=data_type, verbose=True)
print ge.score(data_type)
print ge.ensemble_
checking model sgd improvement from 0.0 to 0.844444444444 0.844444444444 checking model svc improvement from 0.844444444444 to 0.844444444444 0.844444444444 checking model linear_svc improvement from 0.844444444444 to 0.955555555556 0.955555555556 0.955555555556 ['sgd', 'svc', 'linear_svc']
Putting it all together
## TEST ensemble with digits and other data
!ls data
blackbox_ensemble blackbox.pkl digits.pkl icml2013-blackbox MNIST tmp
import cPickle
blackbox = cPickle.load(open('data/digits.pkl', 'rb'))
X, y = blackbox
print X.shape, y.shape
(42000, 784) (42000,)
from sklearn.cross_validation import train_test_split
train_X, validation_X, train_y, validation_y = train_test_split(X, y, test_size = 0.2)
print train_X.shape, validation_X.shape
print train_y.shape, validation_y.shape
(33600, 784) (8400, 784) (33600,) (8400,)
## make new ensemble
reload(ensemble)