In [1]:

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [2]:

# Initialise the h2o session for this kernel. The recorded output below lists
# the cluster status and the connection address (127.0.0.1:54321 in that run).
h2o.init()

Warning: Version mismatch. H2O is version 3.5.0.99999, but the python package is version UNKNOWN.

H2O cluster uptime:	46 minutes 47 seconds 756 milliseconds
H2O cluster version:	3.5.0.99999
H2O cluster name:	ludirehak
H2O cluster total nodes:	1
H2O cluster total memory:	4.44 GB
H2O cluster total cores:	8
H2O cluster allowed cores:	8
H2O cluster healthy:	True
H2O Connection ip:	127.0.0.1
H2O Connection port:	54321

In [3]:

# _locate is a private h2o helper used to find files within the h2o git
# project directory.
from h2o.utils.shared_utils import _locate

# Resolve the training archive's location first, then import it into H2O.
air_train_path = _locate("smalldata/airlines/AirlinesTrain.csv.zip")
air = h2o.import_file(path=air_train_path)

Parse Progress: [##################################################] 100%
Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTrain.csv.zip. Parsed 24,421 rows and 12 cols

In [4]:

# Build the train and validation sets by random sampling (roughly 80/20).
# The seed is fixed so that the split -- and every metric derived from it --
# is the same after a kernel restart instead of changing on each run.
SPLIT_SEED = 42

# runif() creates a column of uniform random numbers as tall as air.nrow;
# comparing it against 0.8 yields the row masks for the two subsets.
r = air[0].runif(seed=SPLIT_SEED)
air_train = air[r < 0.8]
air_valid = air[r >= 0.8]

# Predictor columns and the response column used when training the models.
myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
myY = "IsDepDelayed"

In [5]:

# Gradient boosting machine.
# Hyper-parameters are collected in one place, the estimator is fitted on the
# training split while being scored on the validation split, and the full
# model report is printed at the end.
gbm_settings = dict(distribution="bernoulli", ntrees=100, max_depth=3, learn_rate=0.01)
gbm = H2OGradientBoostingEstimator(**gbm_settings)
gbm.train(x=myX, y=myY, training_frame=air_train, validation_frame=air_valid)
gbm.show()

gbm Model Build Progress: [##################################################] 100%
Model Details
=============
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1445544453075_131

Model Summary:

	number_of_trees	model_size_in_bytes	min_depth	max_depth	mean_depth	min_leaves	max_leaves	mean_leaves
	100.0	21616.0	3.0	3.0	3.0	8.0	8.0	8.0


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.225577653051
R^2: 0.0898968077725
LogLoss: 0.643152070892
AUC: 0.698999790699
Gini: 0.397999581398

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.449028022489:

	NO	YES	Error	Rate
NO	2837.0	6004.0	0.6791	(6004.0/8841.0)
YES	1198.0	9446.0	0.1126	(1198.0/10644.0)
Total	4035.0	15450.0	0.3696	(7202.0/19485.0)

Maximum Metrics: Maximum metrics at their respective thresholds

metric	threshold	value	idx
max f1	0.4	0.7	327.0
max f2	0.4	0.9	396.0
max f0point5	0.5	0.7	217.0
max accuracy	0.5	0.7	217.0
max precision	0.7	1.0	0.0
max absolute_MCC	0.5	0.3	217.0
max min_per_class_accuracy	0.5	0.6	199.0

ModelMetricsBinomial: gbm
** Reported on validation data. **

MSE: 0.226773773291
R^2: 0.0840250526986
LogLoss: 0.64567275652
AUC: 0.689332681253
Gini: 0.378665362506

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.447676388566:

	NO	YES	Error	Rate
NO	709.0	1516.0	0.6813	(1516.0/2225.0)
YES	271.0	2440.0	0.1	(271.0/2711.0)
Total	980.0	3956.0	0.362	(1787.0/4936.0)

Maximum Metrics: Maximum metrics at their respective thresholds

metric	threshold	value	idx
max f1	0.4	0.7	332.0
max f2	0.4	0.9	390.0
max f0point5	0.5	0.7	223.0
max accuracy	0.5	0.6	278.0
max precision	0.7	0.9	3.0
max absolute_MCC	0.5	0.3	223.0
max min_per_class_accuracy	0.5	0.6	205.0

Scoring History:

	timestamp	duration	number_of_trees	training_MSE	training_logloss	training_AUC	training_classification_error	validation_MSE	validation_logloss	validation_AUC	validation_classification_error
	2015-10-22 13:54:21	0.064 sec	1.0	0.2	0.7	0.7	0.4	0.2	0.7	0.7	0.4
	2015-10-22 13:54:21	0.105 sec	2.0	0.2	0.7	0.7	0.4	0.2	0.7	0.7	0.4
	2015-10-22 13:54:21	0.127 sec	3.0	0.2	0.7	0.7	0.4	0.2	0.7	0.7	0.4
	2015-10-22 13:54:21	0.148 sec	4.0	0.2	0.7	0.7	0.4	0.2	0.7	0.7	0.4
	2015-10-22 13:54:21	0.165 sec	5.0	0.2	0.7	0.7	0.4	0.2	0.7	0.7	0.4
---	---	---	---	---	---	---	---	---	---	---	---
	2015-10-22 13:54:25	3.670 sec	74.0	0.2	0.7	0.7	0.4	0.2	0.7	0.7	0.4
	2015-10-22 13:54:25	3.759 sec	75.0	0.2	0.6	0.7	0.4	0.2	0.7	0.7	0.4
	2015-10-22 13:54:25	3.848 sec	76.0	0.2	0.6	0.7	0.4	0.2	0.7	0.7	0.4
	2015-10-22 13:54:25	3.942 sec	77.0	0.2	0.6	0.7	0.4	0.2	0.7	0.7	0.4
	2015-10-22 13:54:25	4.253 sec	100.0	0.2	0.6	0.7	0.4	0.2	0.6	0.7	0.4

Variable Importances:

variable	relative_importance	scaled_importance	percentage
Origin	16932.9	1.0	0.7
Dest	4282.9	0.3	0.2
UniqueCarrier	1623.9	0.1	0.1
fDayofMonth	1457.2	0.1	0.1
fDayOfWeek	95.7	0.0	0.0
fMonth	67.4	0.0	0.0
Distance	0.0	0.0	0.0

In [6]:

# Generalized linear model.
# Binomial family with the L_BFGS solver, trained on the same predictors,
# response and train/validation frames as the GBM; the coefficient table is
# pretty-printed once training finishes.
glm = H2OGeneralizedLinearEstimator(solver="L_BFGS", family="binomial")
glm.train(training_frame=air_train, validation_frame=air_valid, x=myX, y=myY)
glm.pprint_coef()
    

glm Model Build Progress: [##################################################] 100%

Coefficients: glm coefficients

names	coefficients	standardized_coefficients
Intercept	0.1	0.2
Origin.ABE	-0.0	-0.0
Origin.ABQ	-0.0	-0.0
Origin.ACY	-0.0	-0.0
Origin.ALB	0.0	0.0
---	---	---
fDayOfWeek.f6	-0.1	-0.1
fDayOfWeek.f7	0.0	0.0
fMonth.f1	-0.1	-0.1
fMonth.f10	0.1	0.1
Distance	0.0	0.1

In [7]:

# Import the test archive into H2O; _locate resolves its path within the
# h2o git project directory.
air_test_path = _locate("smalldata/airlines/AirlinesTest.csv.zip")
air_test = h2o.import_file(path=air_test_path)

Parse Progress: [##################################################] 100%
Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTest.csv.zip. Parsed 2,691 rows and 12 cols

In [8]:

# Score both models on the test frame.
# For each model, in order: predict, announce and preview the predictions,
# compute the performance metrics, then announce and print them.
test_outputs = []
for fitted_model, pred_title, perf_title in (
    (gbm, "GBM predictions: ", "GBM performance: "),
    (glm, "GLM predictions: ", "GLM performance: "),
):
    predictions = fitted_model.predict(air_test)
    print(pred_title)
    predictions.head()

    performance = fitted_model.model_performance(air_test)
    print(perf_title)
    performance.show()

    test_outputs.append((predictions, performance))

# Expose the per-model results under the names the following cells use.
(gbm_pred, gbm_perf), (glm_pred, glm_perf) = test_outputs

GBM predictions: 
H2OFrame with 2691 rows and 3 columns:

predict	YES	YES	YES	YES	YES	YES	NO	NO	NO	NO
NO	0.5	0.5	0.5	0.4	0.5	0.5	0.6	0.6	0.6	0.6
YES	0.5	0.5	0.5	0.6	0.5	0.5	0.4	0.4	0.4	0.4

GBM performance: 

ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.226299117103
R^2: 0.086471305524
LogLoss: 0.644721964315
AUC: 0.693439503015
Gini: 0.386879006031

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.446332927983:

	NO	YES	Error	Rate
NO	345.0	872.0	0.7165	(872.0/1217.0)
YES	136.0	1338.0	0.0923	(136.0/1474.0)
Total	481.0	2210.0	0.3746	(1008.0/2691.0)

Maximum Metrics: Maximum metrics at their respective thresholds

metric	threshold	value	idx
max f1	0.4	0.7	335.0
max f2	0.4	0.9	391.0
max f0point5	0.5	0.7	220.0
max accuracy	0.5	0.7	225.0
max precision	0.7	1.0	0.0
max absolute_MCC	0.5	0.3	225.0
max min_per_class_accuracy	0.6	0.6	203.0

GLM predictions: 
H2OFrame with 2691 rows and 3 columns:

predict	YES	YES	YES	YES	YES	YES	YES	YES	YES	YES
p0	0.4	0.4	0.4	0.5	0.5	0.5	0.5	0.4	0.5	0.5
p1	0.6	0.6	0.6	0.5	0.5	0.5	0.5	0.6	0.5	0.5

GLM performance: 

ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.232028999965
R^2: 0.0633408025091
LogLoss: 0.656433714264
Null degrees of freedom: 2690
Residual degrees of freedom: 2438
Null deviance: 3705.96023003
Residual deviance: 3532.92625017
AIC: 4038.92625017
AUC: 0.656781919193
Gini: 0.313563838386

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459695105317:

	NO	YES	Error	Rate
NO	280.0	937.0	0.7699	(937.0/1217.0)
YES	106.0	1368.0	0.0719	(106.0/1474.0)
Total	386.0	2305.0	0.3876	(1043.0/2691.0)

Maximum Metrics: Maximum metrics at their respective thresholds

metric	threshold	value	idx
max f1	0.5	0.7	309.0
max f2	0.4	0.9	391.0
max f0point5	0.5	0.7	256.0
max accuracy	0.5	0.6	256.0
max precision	0.8	1.0	0.0
max absolute_MCC	0.5	0.2	257.0
max min_per_class_accuracy	0.6	0.6	192.0

In [9]:

# Build and print the confusion matrix of each model on the test set.
gbm_CM = gbm_perf.confusion_matrix()
print(gbm_CM)
# Blank separator line between the two matrices. A bare `print` only emits a
# newline as a Python 2 statement; under Python 3 it is a no-op expression.
# print("") produces the same empty line on both.
print("")

glm_CM = glm_perf.confusion_matrix()
print(glm_CM)

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.446332927983:

	NO	YES	Error	Rate
NO	345.0	872.0	0.7165	(872.0/1217.0)
YES	136.0	1338.0	0.0923	(136.0/1474.0)
Total	481.0	2210.0	0.3746	(1008.0/2691.0)



Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459695105317:

	NO	YES	Error	Rate
NO	280.0	937.0	0.7699	(937.0/1217.0)
YES	106.0	1368.0	0.0719	(106.0/1474.0)
Total	386.0	2305.0	0.3876	(1043.0/2691.0)

In [10]:

# Summary metrics on the test set: precision, accuracy and AUC per model.
# In the recorded run, precision() and accuracy() print as [[threshold, value]]
# pairs that line up with the "max precision" / "max accuracy" rows of the
# Maximum Metrics tables, while auc() prints a single number.
print('GBM Precision: {0}'.format(gbm_perf.precision()))
print('GBM Accuracy: {0}'.format(gbm_perf.accuracy()))
print('GBM AUC: {0}'.format(gbm_perf.auc()))
# Blank separator line. A bare `print` is a no-op expression under Python 3;
# print("") emits the empty line on both Python 2 and Python 3.
print("")
print('GLM Precision: {0}'.format(glm_perf.precision()))
print('GLM Accuracy: {0}'.format(glm_perf.accuracy()))
print('GLM AUC: {0}'.format(glm_perf.auc()))

GBM Precision: [[0.7017496139979671, 1.0]]
GBM Accuracy: [[0.5351575413437655, 0.6596060943887031]]
GBM AUC: 0.693439503015

GLM Precision: [[0.7667089295101112, 1.0]]
GLM Accuracy: [[0.512905531794376, 0.63173541434411]]
GLM AUC: 0.656781919193