import h2o
h2o.init()
H2O cluster uptime: | 1 minutes 50 seconds 618 milliseconds |
H2O cluster version: | 3.1.0.99999 |
H2O cluster name: | ece |
H2O cluster total nodes: | 1 |
H2O cluster total memory: | 4.44 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
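# Not in the original run: if an H2O cluster is already up, h2o.init() can
# attach to it instead of launching a new one (assuming this h2o-py version
# accepts ip/port arguments; the values below are the defaults shown above).
# h2o.init(ip="127.0.0.1", port=54321)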
# importing the airlines training data into h2o
air = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/airlines/AirlinesTrain.csv.zip. Parsed 24,421 rows and 12 cols
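# Not in the original run: a quick look at the parsed frame. head() is used
# later in this demo; describe() is assumed available in this h2o-py build.
air.head()        # first rows of the 24,421 x 12 frame
air.describe()    # per-column types and summary statistics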
# Constructing train and validation sets with an 80/20 random split
# runif() creates a column of uniform random numbers as tall as air.nrow()
r = air[0].runif()
air_train = air[r < 0.8]
air_valid = air[r >= 0.8]
myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
myY = "IsDepDelayed"
# training a gradient boosting machine (GBM)
gbm = h2o.gbm(x=air_train[myX],
              y=air_train[myY],
              validation_x=air_valid[myX],
              validation_y=air_valid[myY],
              distribution="bernoulli",
              ntrees=100,
              max_depth=3,
              learn_rate=0.01)
gbm.show()
gbm Model Build Progress: [##################################################] 100%
Model Details
=============
H2OBinomialModel : Gradient Boosting Machine
Model Key: GBMModel__83569002bd127b1b24610fe4ac52444c
Model Summary:
number_of_trees | model_size_in_bytes | min_depth | max_depth | mean_depth | min_leaves | max_leaves | mean_leaves |
100.0 | 21889.0 | 3.0 | 3.0 | 3.0 | 8.0 | 8.0 | 8.0 |
ModelMetricsBinomial: gbm
** Reported on train data. **
MSE: 0.224935884507
R^2: 0.0917523735414
LogLoss: 0.641870843139
AUC: 0.700860264576
Gini: 0.401720529152
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.45100329685:
  | NO | YES | Error | Rate |
NO | 2703.0 | 6143.0 | 0.6944 | (6143.0/8846.0) |
YES | 1067.0 | 9680.0 | 0.0993 | (1067.0/10747.0) |
Total | 3770.0 | 15823.0 | 0.7937 | (0.7937/19593.0) |
Maximum Metrics:
metric | threshold | value | idx |
f1 | 0.45100329685 | 0.728641324802 | 331.0 |
f2 | 0.376803747622 | 0.859382506486 | 396.0 |
f0point5 | 0.538983613241 | 0.683115048095 | 218.0 |
accuracy | 0.521859623661 | 0.654366355331 | 240.0 |
precision | 0.681933134563 | 0.901162790698 | 8.0 |
absolute_MCC | 0.538983613241 | 0.299292001087 | 218.0 |
min_per_class_accuracy | 0.54865448394 | 0.644551967991 | 204.0 |
tns | 0.690888629343 | 8833.0 | 0.0 |
fns | 0.690888629343 | 10648.0 | 0.0 |
fps | 0.371575110378 | 8846.0 | 399.0 |
tps | 0.371575110378 | 10747.0 | 399.0 |
tnr | 0.690888629343 | 0.998530409225 | 0.0 |
fnr | 0.690888629343 | 0.990788126919 | 0.0 |
fpr | 0.371575110378 | 1.0 | 399.0 |
tpr | 0.371575110378 | 1.0 | 399.0 |
ModelMetricsBinomial: gbm
** Reported on validation data. **
MSE: 0.2275183899
R^2: 0.0842002842717
LogLoss: 0.647224224791
AUC: 0.68803214641
Gini: 0.37606429282
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.429662357774:
  | NO | YES | Error | Rate |
NO | 435.0 | 1785.0 | 0.8041 | (1785.0/2220.0) |
YES | 137.0 | 2471.0 | 0.0525 | (137.0/2608.0) |
Total | 572.0 | 4256.0 | 0.8566 | (0.8566/4828.0) |
Maximum Metrics:
metric | threshold | value | idx |
f1 | 0.429662357774 | 0.719988344988 | 356.0 |
f2 | 0.376803773922 | 0.854684009986 | 396.0 |
f0point5 | 0.539014213255 | 0.674244668246 | 217.0 |
accuracy | 0.526662150196 | 0.65057995029 | 232.0 |
precision | 0.67636982654 | 0.835443037975 | 18.0 |
absolute_MCC | 0.539014213255 | 0.292962334179 | 217.0 |
min_per_class_accuracy | 0.548567487854 | 0.631901840491 | 202.0 |
tns | 0.690888600455 | 2213.0 | 0.0 |
fns | 0.690888600455 | 2589.0 | 0.0 |
fps | 0.371575143654 | 2220.0 | 399.0 |
tps | 0.371575143654 | 2608.0 | 399.0 |
tnr | 0.690888600455 | 0.996846846847 | 0.0 |
fnr | 0.690888600455 | 0.992714723926 | 0.0 |
fpr | 0.371575143654 | 1.0 | 399.0 |
tpr | 0.371575143654 | 1.0 | 399.0 |
Scoring History:
timestamp | duration | number_of_trees | training_MSE | training_logloss | training_AUC | training_classification_error | validation_MSE | validation_logloss | validation_AUC | validation_classification_error | |
2015-05-22 13:19:39 | 0.073 sec | 1.0 | 0.247227169696 | 0.687586187163 | 0.662122035392 | 0.385698974123 | 0.248060763596 | 0.689258980589 | 0.650669457801 | 0.386909693455 | |
2015-05-22 13:19:39 | 0.111 sec | 2.0 | 0.246816106849 | 0.686756385519 | 0.66222330505 | 0.385698974123 | 0.247675119161 | 0.688480563888 | 0.650790619991 | 0.386909693455 | |
2015-05-22 13:19:39 | 0.142 sec | 3.0 | 0.246413615521 | 0.685943950168 | 0.66257594751 | 0.385698974123 | 0.247291514047 | 0.687706372792 | 0.65123718427 | 0.386909693455 | |
2015-05-22 13:19:39 | 0.158 sec | 4.0 | 0.246019285467 | 0.685148026987 | 0.662749723193 | 0.386464553667 | 0.246920994436 | 0.686958679422 | 0.651638409882 | 0.387116818558 | |
2015-05-22 13:19:39 | 0.178 sec | 5.0 | 0.245631966235 | 0.684366278301 | 0.662702378116 | 0.386464553667 | 0.246552236947 | 0.68621460685 | 0.651539096612 | 0.387116818558 | |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
2015-05-22 13:19:42 | 3.694 sec | 80.0 | 0.227535224089 | 0.647373511398 | 0.697257952158 | 0.371101924157 | 0.229781026205 | 0.651996607902 | 0.68473434132 | 0.381731565866 | |
2015-05-22 13:19:43 | 3.777 sec | 81.0 | 0.227384102324 | 0.647055090045 | 0.697614870507 | 0.371101924157 | 0.229651399848 | 0.651724668276 | 0.685020968054 | 0.381731565866 | |
2015-05-22 13:19:43 | 3.861 sec | 82.0 | 0.227239988942 | 0.646750400551 | 0.697702497293 | 0.370846730975 | 0.229522678951 | 0.651453477146 | 0.685221321782 | 0.381731565866 | |
2015-05-22 13:19:43 | 3.947 sec | 83.0 | 0.227073325763 | 0.646400341159 | 0.697905978041 | 0.370846730975 | 0.229386828575 | 0.651167496306 | 0.685343433925 | 0.381731565866 | |
2015-05-22 13:19:43 | 4.183 sec | 100.0 | 0.224935884507 | 0.641870843139 | 0.700860264576 | 0.367988567345 | 0.2275183899 | 0.647224224791 | 0.68803214641 | 0.398094449047 |
Variable Importances:
variable | relative_importance | scaled_importance | percentage |
Origin | 17213.3203125 | 1.0 | 0.685965839068 |
Dest | 4465.96972656 | 0.259448476266 | 0.177972791717 |
UniqueCarrier | 1887.43884277 | 0.109649899526 | 0.075216085332 |
fDayofMonth | 1266.3125 | 0.0735658476698 | 0.0504636584235 |
fMonth | 203.423248291 | 0.0118177809161 | 0.008106594002 |
fDayOfWeek | 57.0886230469 | 0.00331653754246 | 0.00227503145812 |
Distance | 0.0 | 0.0 | 0.0 |
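# Not in the original run: the validation metrics printed above can also be
# pulled programmatically, using the same model_performance()/auc() calls the
# demo applies to the test set below.
gbm_valid_perf = gbm.model_performance(air_valid)
print "GBM validation AUC: {0}".format(gbm_valid_perf.auc())   # ~0.688 on this split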
# training a generalized linear model (GLM) with the L-BFGS solver
glm = h2o.glm(x=air_train[myX],
              y=air_train[myY],
              validation_x=air_valid[myX],
              validation_y=air_valid[myY],
              family="binomial",
              solver="L_BFGS")
glm.pprint_coef()
glm Model Build Progress: [##################################################] 100%
Coefficients:
names | coefficients | standardized_coefficients |
Intercept | 0.0373707847069 | 0.195063579531 |
Origin.ABE | -0.0401578633536 | -0.0401578633536 |
Origin.ABQ | -0.0938267138619 | -0.0938267138619 |
Origin.ACY | -0.135339354063 | -0.135339354063 |
Origin.ALB | 0.0711798459683 | 0.0711798459683 |
--- | --- | --- |
fDayOfWeek.f6 | -0.156236716144 | -0.156236716144 |
fDayOfWeek.f7 | 0.0472831537707 | 0.0472831537707 |
fMonth.f1 | -0.221575958907 | -0.221575958907 |
fMonth.f10 | 0.208857303935 | 0.208857303935 |
Distance | 0.00020866333889 | 0.131663819411 |
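# Not in the original run: besides pprint_coef(), the coefficients can be
# retrieved as a Python dict, assuming coef() is available in this h2o-py build.
coefs = glm.coef()
print coefs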
# importing the airlines test data into h2o
air_test = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/airlines/AirlinesTest.csv.zip. Parsed 2,691 rows and 12 cols
# predicting on the test set and measuring test-set performance
gbm_pred = gbm.predict(air_test)
print "GBM predictions: "
gbm_pred.head()
gbm_perf = gbm.model_performance(air_test)
print "GBM performance: "
gbm_perf.show()
glm_pred = glm.predict(air_test)
print "GLM predictions: "
glm_pred.head()
glm_perf = glm.model_performance(air_test)
print "GLM performance: "
glm_perf.show()
GBM predictions:
First 10 rows and first 3 columns:
Row ID | predict | NO | YES |
1 | YES | 0.47525141674393157 | 0.5247485832560684 |
2 | YES | 0.48024938136117845 | 0.5197506186388215 |
3 | YES | 0.48024938136117845 | 0.5197506186388215 |
4 | YES | 0.402168737810524 | 0.597831262189476 |
5 | YES | 0.5136446294303063 | 0.48635537056969363 |
6 | YES | 0.5136446294303063 | 0.48635537056969363 |
7 | YES | 0.5478525167901855 | 0.45214748320981446 |
8 | YES | 0.5580925509767907 | 0.4419074490232094 |
9 | YES | 0.5580925509767907 | 0.4419074490232094 |
10 | YES | 0.5580925509767907 | 0.4419074490232094 |
GBM performance:
ModelMetricsBinomial: gbm
** Reported on test data. **
MSE: 0.226368400011
R^2: 0.0865312024067
LogLoss: 0.644861693711
AUC: 0.692409878597
Gini: 0.384819757194
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.441901440341:
  | NO | YES | Error | Rate |
NO | 293.0 | 924.0 | 0.7592 | (924.0/1217.0) |
YES | 112.0 | 1362.0 | 0.076 | (112.0/1474.0) |
Total | 405.0 | 2286.0 | 0.8352 | (0.8352/2691.0) |
Maximum Metrics:
metric | threshold | value | idx |
f1 | 0.441901440341 | 0.724468085106 | 339.0 |
f2 | 0.383773415183 | 0.859786810355 | 391.0 |
f0point5 | 0.543468874795 | 0.6851506265 | 213.0 |
accuracy | 0.522296596314 | 0.657748049052 | 242.0 |
precision | 0.678096525394 | 0.847222222222 | 14.0 |
absolute_MCC | 0.543468874795 | 0.30464489118 | 213.0 |
min_per_class_accuracy | 0.549319523433 | 0.642469470828 | 206.0 |
tns | 0.690888600455 | 1213.0 | 0.0 |
fns | 0.690888600455 | 1461.0 | 0.0 |
fps | 0.371575143654 | 1217.0 | 399.0 |
tps | 0.371575143654 | 1474.0 | 399.0 |
tnr | 0.690888600455 | 0.996713229252 | 0.0 |
fnr | 0.690888600455 | 0.99118046133 | 0.0 |
fpr | 0.371575143654 | 1.0 | 399.0 |
tpr | 0.371575143654 | 1.0 | 399.0 |
GLM predictions:
First 10 rows and first 3 columns:
Row ID | predict | p0 | p1 |
1 | YES | 0.33138044246038023 | 0.6686195575396198 |
2 | YES | 0.3914744148501228 | 0.6085255851498772 |
3 | YES | 0.36039204225753896 | 0.639607957742461 |
4 | YES | 0.4304740051645429 | 0.5695259948354571 |
5 | YES | 0.5256165167500713 | 0.4743834832499287 |
6 | YES | 0.5562418812088273 | 0.44375811879117266 |
7 | YES | 0.48440139277691874 | 0.5155986072230813 |
8 | YES | 0.44487802611756044 | 0.5551219738824396 |
9 | YES | 0.5819723452658147 | 0.41802765473418535 |
10 | YES | 0.5685108555327485 | 0.4314891444672515 |
GLM performance:
ModelMetricsBinomialGLM: glm
** Reported on test data. **
MSE: 0.220260505275
R^2: 0.111178508566
LogLoss: 0.630774448994
Null degrees of freedom: 2690
Residual degrees of freedom: 2438
Null deviance: 3705.94255374
Residual deviance: 3394.82808448
AIC: 3900.82808448
AUC: 0.69739355066
Gini: 0.39478710132
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.443988379059:
  | NO | YES | Error | Rate |
NO | 391.0 | 826.0 | 0.6787 | (826.0/1217.0) |
YES | 161.0 | 1313.0 | 0.1092 | (161.0/1474.0) |
Total | 552.0 | 2139.0 | 0.7879 | (0.7879/2691.0) |
Maximum Metrics:
metric | threshold | value | idx |
f1 | 0.443988379059 | 0.726819817326 | 284.0 |
f2 | 0.247001441468 | 0.860535860536 | 382.0 |
f0point5 | 0.569158065903 | 0.685638454733 | 183.0 |
accuracy | 0.540614921318 | 0.655890003716 | 211.0 |
precision | 0.887237238744 | 1.0 | 0.0 |
absolute_MCC | 0.569158065903 | 0.303360041004 | 183.0 |
min_per_class_accuracy | 0.563183947037 | 0.644504748982 | 189.0 |
tns | 0.887237238744 | 1217.0 | 0.0 |
fns | 0.887237238744 | 1472.0 | 0.0 |
fps | 0.186084076673 | 1217.0 | 399.0 |
tps | 0.215917647428 | 1474.0 | 393.0 |
tnr | 0.887237238744 | 1.0 | 0.0 |
fnr | 0.887237238744 | 0.998643147897 | 0.0 |
fpr | 0.186084076673 | 1.0 | 399.0 |
tpr | 0.215917647428 | 1.0 | 393.0 |
# Building the confusion matrices for the test set
gbm_CM = gbm_perf.confusion_matrix()
print(gbm_CM)
print
glm_CM = glm_perf.confusion_matrix()
print(glm_CM)
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.441901440341:
  | NO | YES | Error | Rate |
NO | 293.0 | 924.0 | 0.7592 | (924.0/1217.0) |
YES | 112.0 | 1362.0 | 0.076 | (112.0/1474.0) |
Total | 405.0 | 2286.0 | 0.8352 | (0.8352/2691.0) |
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.443988379059:
  | NO | YES | Error | Rate |
NO | 391.0 | 826.0 | 0.6787 | (826.0/1217.0) |
YES | 161.0 | 1313.0 | 0.1092 | (161.0/1474.0) |
Total | 552.0 | 2139.0 | 0.7879 | (0.7879/2691.0) |
# Test-set precision, accuracy, and AUC
print('GBM Precision: {0}'.format(gbm_perf.precision()))
print('GBM Accuracy: {0}'.format(gbm_perf.accuracy()))
print('GBM AUC: {0}'.format(gbm_perf.auc()))
print
print('GLM Precision: {0}'.format(glm_perf.precision()))
print('GLM Accuracy: {0}'.format(glm_perf.accuracy()))
print('GLM AUC: {0}'.format(glm_perf.auc()))
GBM Precision: [[0.6780965253938488, 0.8472222222222222]]
GBM Accuracy: [[0.5222965963143628, 0.6577480490523968]]
GBM AUC: 0.692409878597

GLM Precision: [[0.8872372387438643, 1.0]]
GLM Accuracy: [[0.5406149213176982, 0.6558900037160906]]
GLM AUC: 0.69739355066
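# Not in the original run: a compact side-by-side of the test AUCs above,
# using the same perf objects and auc() calls as the prints just before.
print "Test AUC -- GBM: {0}, GLM: {1}".format(gbm_perf.auc(), glm_perf.auc())
# Optionally shut the cluster down when finished (assuming h2o.shutdown()
# is available in this h2o-py version):
# h2o.shutdown()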