!ls data
blackbox_ensemble blackbox.pkl
import cPickle
import numpy as np
from sklearn import preprocessing
import matplotlib.pylab as plt
import os
from os import path
import shutil
from sklearn.externals import joblib
import json
import copy
## load data
blackbox = cPickle.load(open('data/blackbox.pkl', 'rb'))
X, y = blackbox
print X.shape
print y.shape
plt.plot(np.mean(X, axis = 0), label='X.mean')
plt.plot(np.std(X, axis = 0), label='X.std')
plt.legend(loc='best')
(1000, 1875)
(1000,)
<matplotlib.legend.Legend at 0x3452110>
## scaled version
ScaledX = preprocessing.StandardScaler().fit_transform(X)
plt.plot(np.mean(ScaledX, axis = 0), label='ScaledX.mean')
plt.plot(np.std(ScaledX, axis = 0), label='ScaledX.std')
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x2e8a5d0>
## impose a structure on the ensemble folder
## housekeeping work: creating sub-folders, persisting different data, reading/writing configurations
## start with a simple function interface and grow it as things get more complicated
## create a folder for the ensemble (the intended layout is sketched after remove_ensemble below)
def make_ensemble(name, context_folder):
    ensemble_path = path.abspath(path.join(context_folder, name))
    print "DEBUG: ", ensemble_path
    try:
        os.mkdir(ensemble_path)
        os.mkdir(path.join(ensemble_path, 'data'))
        os.mkdir(path.join(ensemble_path, 'models'))
        with open(path.join(ensemble_path, 'models.json'), 'wb') as f:
            f.write('{}')
        with open(path.join(ensemble_path, 'data.json'), 'wb') as f:
            f.write('{}')
        return (name, ensemble_path)
    except Exception, ex:
        print 'CREATING FOLDER ERROR: ensemble path', ensemble_path, 'could have existed'
        raise ex
## destroy the whole ensemble folder
def remove_ensemble(folder):
    shutil.rmtree(path.abspath(folder))
    print 'REMOVE ENSEMBLE FOLDER:', path.abspath(folder)
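## For reference, the folder layout make_ensemble is meant to produce (sketched from
## the code above; both json files start out as empty JSON objects):
##
##   <context_folder>/<name>/
##       data/          <- joblib-persisted datasets
##       models/        <- joblib-persisted models
##       data.json      <- registry of persisted datasets
##       models.json    <- registry of persisted models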
## TEST creating ensemble model
!ls
ensemble_name, ensemble_path = make_ensemble('blackbox_ensemble', './data')
#print make_ensemble('blackbox_ensemble', './data')
!tree data/blackbox_ensemble
!cat data/blackbox_ensemble/data.json
remove_ensemble(ensemble_path)
!tree data/blackbox_ensemble
BASIC_sklearn_fact_book.ipynb PRACTICE_feature_and_ensemble.ipynb data PRACTICE_greedy_ensemble.ipynb FEATURE_by_clustering.ipynb README.md FEATURE_images.ipynb REGULARIZATION_sklearn_comparison.ipynb pca_vs_ica.png
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-6-b455a8d661fe> in <module>()
      1 ## TEST creating ensemble model
      2 get_ipython().system(u'ls')
----> 3 ensemble_name, ensemble_path = make_ensemble('blackbox_ensemble', './data')
      4 #print make_ensemble('blackbox_ensemble', './data')
      5 get_ipython().system(u'tree data/blackbox_ensemble')

<ipython-input-5-57d4fff2e81a> in make_ensemble(name, context_folder)
     18     except Exception, ex:
     19         print 'CREATING FOLDER ERROR: ensemble path', ensemble_path, 'could have existed'
---> 20         raise ex
     21         return None, None
     22 

OSError: [Errno 17] File exists: '/home/dola/workspace/tutorials/ml-tutorials/data/blackbox_ensemble'
DEBUG:  /home/dola/workspace/tutorials/ml-tutorials/data/blackbox_ensemble
CREATING FOLDER ERROR: ensemble path /home/dola/workspace/tutorials/ml-tutorials/data/blackbox_ensemble could have existed
## HELPER: add json records to an existing json-based configuration file
## the configuration file holds a dictionary, keyed by string, valued by any JSON-serializable structure
def add_json_record(json_file, records, overwrite = False):
    """
    PARAM json_file: the path of the configuration json file - a dict
    PARAM records: a dict of {key_str: any_value} entries to insert into the json file
    PARAM overwrite: whether to overwrite an existing key; if False, existing keys keep their value and the new one is ignored
    """
    json_path = path.abspath(json_file)
    configure = json.load(open(json_path, 'rb'))
    for (key, value) in records.items():
        if not overwrite and key in configure:
            #raise RuntimeError(' '.join(['key', key, 'already in the configuration file', json_path]))
            print 'IGNORED:', ' '.join(['key', key, 'already in the configuration file', json_path])
        else:
            configure[key] = value
    json.dump(configure, open(json_path, 'wb'))
def remove_json_record(json_file, key):
    json_path = path.abspath(json_file)
    configure = json.load(open(json_path, 'rb'))
    del configure[key]
    json.dump(configure, open(json_path, 'wb'))

def read_json_record(json_file, key):
    json_path = path.abspath(json_file)
    try:
        return json.load(open(json_path, 'rb'))[key]
    except (IOError, KeyError, ValueError):
        ## a missing file, missing key, or malformed json all read as "no record"
        return None
## TEST with add_json_record & remove_json_record
#make_ensemble('blackbox_ensemble', context_folder = './data')
add_json_record('data/blackbox_ensemble/data.json', {'dumb': {'pi': 3.14, 'hi': 'je;'}})
#!cat data/blackbox_ensemble/data.json
print read_json_record('data/blackbox_ensemble/data.json', 'dumb')
remove_json_record('data/blackbox_ensemble/data.json', 'dumb')
print read_json_record('data/blackbox_ensemble/data.json', 'dumb')
{u'pi': 3.14, u'hi': u'je;'}
None
## persist data to the data folder and record it in the data.json configuration
## later, different models can load the data in a shared-memory manner (see the memory-mapped loading sketch after load_data below)
def persist_data(ensemble_folder, data_name, data, description,
                 compress_level = 9, overwrite=False):
    """
    Write data to a joblib shared-memory friendly file and register it in data.json
    """
    ## write data to file
    data_file = data_name + '.pkl'
    data_path = path.abspath(path.join(ensemble_folder, 'data', data_file))
    store_files = joblib.dump(data, data_path, compress=compress_level)
    store_files = map(path.abspath, store_files)
    ## write record to config
    config_file = path.abspath(path.join(ensemble_folder, 'data.json'))
    config_record = {
        data_name: { 'name': data_name
                   , 'file': data_path
                   , 'stores': store_files
                   , 'description': description}
    }
    add_json_record(config_file, config_record, overwrite=overwrite)
def remove_data(ensemble_folder, data_name):
    config_file = path.abspath(path.join(ensemble_folder, 'data.json'))
    config_record = read_json_record(config_file, data_name)
    if config_record:
        ## delete data file(s)
        stores = config_record['stores']
        for f in stores:
            os.remove(f)
        ## remove the configuration record
        remove_json_record(config_file, data_name)
    else:
        #raise RuntimeError('No record in config called ' + data_name)
        print 'IGNORED: ', 'No record in config called ' + data_name
def load_data(ensemble_folder, data_name):
    ## load config record
    config_file = path.abspath(path.join(ensemble_folder, 'data.json'))
    config_record = read_json_record(config_file, data_name)
    ## load real data
    data = joblib.load(config_record['file'])
    config_record['data'] = data
    return config_record
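## A minimal sketch (not part of the original helpers; the name load_data_mmap is an
## assumption) of the "shared-memory" loading mentioned above: joblib can memory-map the
## persisted arrays via mmap_mode, so several worker processes share one copy of the data.
## NOTE this only works for data dumped WITHOUT compression (compress=0); the default
## compress_level=9 above produces a compressed file that cannot be memory-mapped.
def load_data_mmap(ensemble_folder, data_name):
    config_file = path.abspath(path.join(ensemble_folder, 'data.json'))
    config_record = read_json_record(config_file, data_name)
    ## memory-map the arrays read-only instead of copying them into each process
    config_record['data'] = joblib.load(config_record['file'], mmap_mode='r')
    return config_record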
## TEST persist_data
#remove_data('data/blackbox_ensemble/', 'iris')
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
#persist_data('data/blackbox_ensemble/', 'iris', ((X, y), iris), 'iris data')
persist_data('data/blackbox_ensemble/', data_name='iris', data=np.array([1, 2, 3, None]), description='iris data')
print load_data('data/blackbox_ensemble', 'iris')
!ls data/blackbox_ensemble/data/
remove_data('data/blackbox_ensemble/', 'iris')
!ls data/blackbox_ensemble/data/
{'data': array([1, 2, 3, None], dtype=object), u'stores': [u'/home/dola/workspace/tutorials/ml-tutorials/data/blackbox_ensemble/data/iris.pkl'], u'name': u'iris', u'file': u'/home/dola/workspace/tutorials/ml-tutorials/data/blackbox_ensemble/data/iris.pkl', u'description': u'iris data'}
iris.pkl
## persist a model
def persist_model(ensemble_folder, model_name, model,
                  train_data = None, validation_data = None, test_data = None,
                  description = '', overwrite=False, **args):
    """
    PARAM ensemble_folder: path to the ensemble root folder
    PARAM model_name: the key of the model in models.json
    PARAM model: the persistable model
    PARAM train_data: name (key in data.json) of the data used for training, or None
    PARAM validation_data: name (key in data.json) of the data used for validation, or None
    PARAM test_data: name (key in data.json) of the data used for testing, or None
    """
    config_file = path.abspath(path.join(ensemble_folder, 'models.json'))
    model_path = path.abspath(path.join(ensemble_folder, 'models', model_name+'.pkl'))
    store_files = joblib.dump(model, model_path)
    model_record = {model_name: {
          'name': model_name
        , 'file': model_path
        , 'stores': store_files
        , 'description': description
        , 'train_data': train_data
        , 'validation_data': validation_data
        , 'test_data': test_data
        }
    }
    add_json_record(config_file, model_record, overwrite=overwrite)
def remove_model(ensemble_folder, model_name):
    config_file = path.abspath(path.join(ensemble_folder, 'models.json'))
    config_record = read_json_record(config_file, model_name)
    if config_record:
        ## delete all model files
        stores = config_record['stores']
        for f in stores:
            os.remove(f)
        ## remove the configuration record
        remove_json_record(config_file, model_name)
    else:
        #raise RuntimeError('no record found in ', model_name)
        print 'IGNORED: ', 'No record in config called ' + model_name
def load_model(ensemble_folder, model_name):
    ## load configuration record
    config_file = path.abspath(path.join(ensemble_folder, 'models.json'))
    config_record = read_json_record(config_file, model_name)
    ## load real model file and populate it into the record
    model = joblib.load(config_record['file'])
    config_record['model'] = model
    return config_record

def update_model(ensemble_folder, model_name, new_config):
    ## merge new_config into the existing record, then re-persist the model under the same key
    model_config = load_model(ensemble_folder, model_name)
    model_config.update(new_config)
    remove_model(ensemble_folder, model_name)
    persist_model(ensemble_folder, model_name, overwrite=True, **model_config)
def copy_model(ensemble_folder, from_model, to_model):
    ## load the model configuration
    ## copy the model pickle and register it under the new name
    ## TODO: REALLY NECESSARY?
    raise NotImplementedError('NOT IMPLEMENTED')
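## A minimal sketch of how copy_model could be filled in by reusing the helpers above
## (the name copy_model_sketch and the key handling are assumptions, not the notebook's
## design; the original deliberately leaves this unimplemented):
def copy_model_sketch(ensemble_folder, from_model, to_model):
    config = load_model(ensemble_folder, from_model)
    model = config.pop('model')
    ## drop fields that persist_model regenerates for the new name
    for key in ('name', 'file', 'stores'):
        config.pop(key, None)
    persist_model(ensemble_folder, to_model, model, **config)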
## TEST model persistence
from sklearn.datasets import load_digits
from sklearn import svm
from sklearn import cross_validation
digits = load_digits()
X, y = digits.data, digits.target
train_X, valid_X, train_y, valid_y = cross_validation.train_test_split(X, y, test_size = 0.3, )
## persist data
persist_data('./data/blackbox_ensemble/', 'digits_train', (train_X, train_y), "digits train set")
persist_data('./data/blackbox_ensemble/', 'digits_valid', (valid_X, valid_y), 'digits valid set')
## TEST train and persist model
svc = svm.SVC()
#svc.fit(X, y)
persist_model('data/blackbox_ensemble/', 'digits_svc', svc, train_data = 'digits_train', )
print load_model('data/blackbox_ensemble/', 'digits_svc')
update_model('data/blackbox_ensemble/', 'digits_svc', {'description': 'UPDATED MODEL', 'validation_data': 'digits_valid'})
print load_model('data/blackbox_ensemble/', 'digits_svc')
#remove_model('data/blackbox_ensemble/', 'digits_svc')
!ls data/blackbox_ensemble/models
{u'stores': [u'/home/dola/workspace/tutorials/ml-tutorials/data/blackbox_ensemble/models/digits_svc.pkl'], u'name': u'digits_svc', u'validation_data': None, u'test_data': None, u'file': u'/home/dola/workspace/tutorials/ml-tutorials/data/blackbox_ensemble/models/digits_svc.pkl', 'model': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, shrinking=True, tol=0.001, verbose=False), u'train_data': u'digits_train', u'description': u''}
{u'stores': [u'/home/dola/workspace/tutorials/ml-tutorials/data/blackbox_ensemble/models/digits_svc.pkl'], u'name': u'digits_svc', u'validation_data': u'digits_valid', u'test_data': None, u'file': u'/home/dola/workspace/tutorials/ml-tutorials/data/blackbox_ensemble/models/digits_svc.pkl', 'model': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, shrinking=True, tol=0.001, verbose=False), u'train_data': u'digits_train', u'description': u'UPDATED MODEL'}
digits_svc.pkl
## fit model on data
def fit_model(ensemble_folder, model_name, data_name):
    """
    PARAM ensemble_folder: the root folder of the ensemble
    PARAM model_name: the key (string) of the model in models.json
    PARAM data_name: the key (string) of the dataset in data.json on which the model is fit
    """
    ## load model and data
    model_config = load_model(ensemble_folder, model_name)
    model = model_config['model']
    data_config = load_data(ensemble_folder, data_name)
    X, y = data_config['data']
    ## fit model to data
    model.fit(X, y)
    model_config['model'] = model
    model_config['train_data'] = data_name
    ## persist the updated model back
    update_model(ensemble_folder, model_name, model_config)
    return model
## predict by model
def predict_model(ensemble_folder, model_name, data_name, probability=True):
    """
    PARAM probability: use predict_proba (True) or predict (False) for prediction
    RETURN (target, prediction) for the dataset; target could be None
    """
    model_config = load_model(ensemble_folder, model_name)
    model = model_config['model']
    X, y = load_data(ensemble_folder, data_name)['data']
    if probability:
        yhat = model.predict_proba(X)
    else:
        yhat = model.predict(X)
    return (y, yhat)
## TODO - evaluate model's performance
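## A minimal sketch of the evaluation helper the TODO above refers to (the name
## evaluate_model and the default metric are assumptions, not the notebook's final design):
## reuse predict_model and score the prediction against the stored target.
from sklearn import metrics

def evaluate_model(ensemble_folder, model_name, data_name, scorer=metrics.accuracy_score):
    ## class predictions (probability=False) so that standard label-based scorers apply
    y, yhat = predict_model(ensemble_folder, model_name, data_name, probability=False)
    return scorer(y, yhat)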
## TEST fit model
## train on train data and predict on validation data
from sklearn import metrics
fit_model('data/blackbox_ensemble/', 'digits_svc', data_name='digits_train')
(y, yhat) = predict_model('data/blackbox_ensemble/', 'digits_svc', data_name='digits_valid', probability=False)
print metrics.classification_report(y, yhat)
## train on validation data and update the model
## predict on validation data
fit_model('data/blackbox_ensemble/', 'digits_svc', data_name='digits_valid')
(y, yhat) = predict_model('data/blackbox_ensemble/', 'digits_svc', data_name='digits_valid', probability=False)
print metrics.classification_report(y, yhat)
             precision    recall  f1-score   support

          0       1.00      0.55      0.71        56
          1       1.00      0.39      0.56        51
          2       1.00      0.77      0.87        47
          3       1.00      0.44      0.61        55
          4       1.00      0.43      0.60        54
          5       1.00      0.33      0.50        57
          6       1.00      0.50      0.67        60
          7       1.00      0.27      0.42        60
          8       1.00      0.04      0.08        51
          9       0.14      1.00      0.25        49

avg / total       0.92      0.46      0.53       540

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        56
          1       1.00      1.00      1.00        51
          2       1.00      1.00      1.00        47
          3       1.00      1.00      1.00        55
          4       1.00      1.00      1.00        54
          5       1.00      1.00      1.00        57
          6       1.00      1.00      1.00        60
          7       1.00      1.00      1.00        60
          8       1.00      1.00      1.00        51
          9       1.00      1.00      1.00        49

avg / total       1.00      1.00      1.00       540
## massively train models on data in parallel
## THE PROCESS OF MAKING INDIVIDUAL MODELS FOR THE ENSEMBLE IS AS FOLLOWS:
## 1. BUILD DIFFERENT TYPES OF DATA SETS, e.g. X and y separately, and later put them together
## 2. PLAY WITH DIFFERENT TYPES OF MODELS ON THE DATASETS - FIND THE REASONABLE RANGE OF PARAMETERS FOR EACH MODEL
## 3. TRAIN DIFFERENT MODELS WITH THEIR PARAMETER CONFIGURATIONS IN PARALLEL AND PERSIST THEM (see the parallel driver sketch below)
def fit_persist_model(ensemble_folder, model_config, data_config):
    """
    PARAM ensemble_folder: root folder of the whole ensemble
    PARAM model_config: dict of { 'model_name': key_in_model_config
                                , 'model_template': template_of_model - a class
                                , 'meta_params': parameters_to_model
                                , 'description': description_of_model}
    PARAM data_config: dict of { 'train_data': key_of_train_data_in_data_config
                              [, 'validation_data': key_of_validation_data_OPTIONAL]
                              [, 'test_data': key_of_test_data_OPTIONAL]}
    EFFECT: build the model from its template, set its meta-parameters, fit it on train_data,
            and persist the model into the ensemble_folder
    """
    ## build model, load train_data, set params and fit
    model = model_config['model_template']()
    train_X, train_y = load_data(ensemble_folder, data_config['train_data'])['data']
    model.set_params(**model_config['meta_params'])
    model.fit(train_X, train_y)
    ## persist model with configuration
    persist_model(ensemble_folder, model_config['model_name'],
                  model, train_data=data_config['train_data'],
                  validation_data=data_config.get('validation_data'),
                  test_data=data_config.get('test_data'),
                  description=model_config['description'])
def parallel_fit_persist_models():
    pass
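## A minimal sketch (an assumption, not the notebook's final design) of how the stub above
## could be filled in with joblib.Parallel: models are fit in parallel, then persisted
## serially so that concurrent processes never write models.json at the same time.
from sklearn.externals.joblib import Parallel, delayed

def _fit_one(ensemble_folder, model_config, data_config):
    ## worker: build the model from its template and fit it on the named training data
    model = model_config['model_template']()
    model.set_params(**model_config['meta_params'])
    train_X, train_y = load_data(ensemble_folder, data_config['train_data'])['data']
    model.fit(train_X, train_y)
    return model

def parallel_fit_persist_models(ensemble_folder, model_configs, data_configs, n_jobs=-1):
    fitted = Parallel(n_jobs=n_jobs)(
        delayed(_fit_one)(ensemble_folder, mc, dc)
        for (mc, dc) in zip(model_configs, data_configs))
    ## persist serially in the parent process
    for (model, mc, dc) in zip(fitted, model_configs, data_configs):
        persist_model(ensemble_folder, mc['model_name'], model,
                      train_data=dc['train_data'],
                      validation_data=dc.get('validation_data'),
                      test_data=dc.get('test_data'),
                      description=mc['description'])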
## TEST fit and persist model
from sklearn.linear_model import SGDClassifier
model_config = { 'model_name': 'digits_sgd'
, 'model_template': SGDClassifier
, 'meta_params': {'alpha': 0.1, 'l1_ratio': 0.3}
, 'description': 'SGDClassifier for digits'}
data_config = { 'train_data': 'digits_train'
, 'validation_data': 'digits_valid'
, 'test_data': None}
fit_persist_model('data/blackbox_ensemble/', model_config, data_config)
model = load_model('data/blackbox_ensemble/', 'digits_sgd')['model']
(y, yhat) = predict_model('data/blackbox_ensemble/', 'digits_sgd', data_name='digits_valid', probability=False)
print metrics.classification_report(y, yhat)
             precision    recall  f1-score   support

          0       0.98      0.98      0.98        56
          1       0.94      0.92      0.93        51
          2       1.00      1.00      1.00        47
          3       0.87      0.96      0.91        55
          4       0.98      0.98      0.98        54
          5       0.95      1.00      0.97        57
          6       1.00      0.95      0.97        60
          7       0.97      0.98      0.98        60
          8       0.96      0.88      0.92        51
          9       1.00      0.96      0.98        49

avg / total       0.96      0.96      0.96       540