import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Connect to a pre-existing cluster
h2o.init()
H2O cluster uptime: | 5 seconds 730 milliseconds |
H2O cluster version: | 3.7.0.99999 |
H2O cluster name: | spIdea |
H2O cluster total nodes: | 1 |
H2O cluster total free memory: | 12.44 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
H2O Connection proxy: | None |
Python Version: | 3.5.0 |
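# h2o.init() with no arguments starts or attaches to a cluster on the local machine.
# To attach to a cluster running elsewhere, the address can be given explicitly; a
# minimal sketch (the ip/port values below are placeholders, not part of this example):
h2o.init(ip="127.0.0.1", port=54321)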
from h2o.utils.shared_utils import _locate # private function used to find files within the h2o git project directory
df = h2o.import_file(path=_locate("smalldata/logreg/prostate.csv"))
Parse Progress: [##################################################] 100%
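# _locate only resolves paths inside a checkout of the h2o git repository. Outside of
# it, the same dataset can be loaded by passing a local path or URL directly to
# h2o.import_file; the S3 URL below is assumed to be the public h2o test-data copy:
df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")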
df.describe()
Rows: 380  Cols: 9
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
CBS | Bits | 1 | 11.111112 | 118 B | 2.4210093 |
C1N | 1-Byte Integers (w/o NAs) | 5 | 55.555557 | 2.2 KB | 45.958145 |
C2 | 2-Byte Integers | 1 | 11.111112 | 828 B | 16.9881 |
C2S | 2-Byte Fractions | 2 | 22.222223 | 1.6 KB | 34.632744 |
Frame distribution summary:
 | size | number_of_rows | number_of_chunks_per_column | number_of_chunks |
172.16.2.84:54321 | 4.8 KB | 380.0 | 1.0 | 9.0 |
mean | 4.8 KB | 380.0 | 1.0 | 9.0 |
min | 4.8 KB | 380.0 | 1.0 | 9.0 |
max | 4.8 KB | 380.0 | 1.0 | 9.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 4.8 KB | 380.0 | 1.0 | 9.0 |
 | ID | CAPSULE | AGE | RACE | DPROS | DCAPS | PSA | VOL | GLEASON |
---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | int | int | real | real | int |
mins | 1.0 | 0.0 | 43.0 | 0.0 | 1.0 | 1.0 | 0.3 | 0.0 | 0.0 |
mean | 190.5 | 0.4026315789473684 | 66.03947368421049 | 1.0868421052631572 | 2.2710526315789488 | 1.1078947368421048 | 15.408631578947375 | 15.812921052631573 | 6.3842105263157904 |
maxs | 380.0 | 1.0 | 79.0 | 2.0 | 4.0 | 2.0 | 139.70000000000002 | 97.60000000000001 | 9.0 |
sigma | 109.84079387914127 | 0.4910743389630552 | 6.527071269173311 | 0.3087732580252793 | 1.0001076181502861 | 0.3106564493514939 | 19.99757266856046 | 18.347619967271175 | 1.0919533744261092 |
zeros | 0 | 227 | 0 | 3 | 0 | 0 | 0 | 167 | 2 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 1.0 | 0.0 | 65.0 | 1.0 | 2.0 | 1.0 | 1.4000000000000001 | 0.0 | 6.0 |
1 | 2.0 | 0.0 | 72.0 | 1.0 | 3.0 | 2.0 | 6.7 | 0.0 | 7.0 |
2 | 3.0 | 0.0 | 70.0 | 1.0 | 1.0 | 2.0 | 4.9 | 0.0 | 6.0 |
3 | 4.0 | 0.0 | 76.0 | 2.0 | 2.0 | 1.0 | 51.2 | 20.0 | 7.0 |
4 | 5.0 | 0.0 | 69.0 | 1.0 | 1.0 | 1.0 | 12.3 | 55.9 | 6.0 |
5 | 6.0 | 1.0 | 71.0 | 1.0 | 3.0 | 2.0 | 3.3000000000000003 | 0.0 | 8.0 |
6 | 7.0 | 0.0 | 68.0 | 2.0 | 4.0 | 2.0 | 31.900000000000002 | 0.0 | 7.0 |
7 | 8.0 | 0.0 | 61.0 | 2.0 | 4.0 | 2.0 | 66.7 | 27.2 | 7.0 |
8 | 9.0 | 0.0 | 69.0 | 1.0 | 1.0 | 1.0 | 3.9 | 24.0 | 7.0 |
9 | 10.0 | 0.0 | 68.0 | 2.0 | 1.0 | 2.0 | 13.0 | 0.0 | 6.0 |
# Remove ID from training frame
train = df.drop("ID")
# For VOL & GLEASON, a zero really means "missing"
vol = train['VOL']
vol[vol == 0] = None
gle = train['GLEASON']
gle[gle == 0] = None
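# Depending on the h2o-py version, the column slices above may be copies rather than
# views of train; a hedged sketch of assigning the modified columns back so the new
# NAs are reflected in the training frame:
train['VOL'] = vol
train['GLEASON'] = gle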
# Convert CAPSULE to a logical factor
train['CAPSULE'] = train['CAPSULE'].asfactor()
# See that the data is ready
train.describe()
Rows: 380  Cols: 8
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
CBS | Bits | 1 | 12.5 | 118 B | 2.9164608 |
C1N | 1-Byte Integers (w/o NAs) | 5 | 62.5 | 2.2 KB | 55.363323 |
C2S | 2-Byte Fractions | 2 | 25.0 | 1.6 KB | 41.72022 |
Frame distribution summary:
 | size | number_of_rows | number_of_chunks_per_column | number_of_chunks |
172.16.2.84:54321 | 4.0 KB | 380.0 | 1.0 | 8.0 |
mean | 4.0 KB | 380.0 | 1.0 | 8.0 |
min | 4.0 KB | 380.0 | 1.0 | 8.0 |
max | 4.0 KB | 380.0 | 1.0 | 8.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 4.0 KB | 380.0 | 1.0 | 8.0 |
 | CAPSULE | AGE | RACE | DPROS | DCAPS | PSA | VOL | GLEASON |
---|---|---|---|---|---|---|---|---|
type | enum | int | int | int | int | real | real | int |
mins | 0.0 | 43.0 | 0.0 | 1.0 | 1.0 | 0.3 | 0.0 | 0.0 |
mean | 0.4026315789473684 | 66.03947368421049 | 1.0868421052631572 | 2.2710526315789488 | 1.1078947368421048 | 15.408631578947375 | 15.812921052631573 | 6.3842105263157904 |
maxs | 1.0 | 79.0 | 2.0 | 4.0 | 2.0 | 139.70000000000002 | 97.60000000000001 | 9.0 |
sigma | 0.4910743389630552 | 6.527071269173311 | 0.3087732580252793 | 1.0001076181502861 | 0.3106564493514939 | 19.99757266856046 | 18.347619967271175 | 1.0919533744261092 |
zeros | 227 | 0 | 3 | 0 | 0 | 0 | 167 | 2 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 65.0 | 1.0 | 2.0 | 1.0 | 1.4000000000000001 | 0.0 | 6.0 |
1 | 0 | 72.0 | 1.0 | 3.0 | 2.0 | 6.7 | 0.0 | 7.0 |
2 | 0 | 70.0 | 1.0 | 1.0 | 2.0 | 4.9 | 0.0 | 6.0 |
3 | 0 | 76.0 | 2.0 | 2.0 | 1.0 | 51.2 | 20.0 | 7.0 |
4 | 0 | 69.0 | 1.0 | 1.0 | 1.0 | 12.3 | 55.9 | 6.0 |
5 | 1 | 71.0 | 1.0 | 3.0 | 2.0 | 3.3000000000000003 | 0.0 | 8.0 |
6 | 0 | 68.0 | 2.0 | 4.0 | 2.0 | 31.900000000000002 | 0.0 | 7.0 |
7 | 0 | 61.0 | 2.0 | 4.0 | 2.0 | 66.7 | 27.2 | 7.0 |
8 | 0 | 69.0 | 1.0 | 1.0 | 1.0 | 3.9 | 24.0 | 7.0 |
9 | 0 | 68.0 | 2.0 | 1.0 | 2.0 | 13.0 | 0.0 | 6.0 |
# Run GBM
my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=50, learn_rate=0.1)
my_gbm.train(x=list(range(1,train.ncol)), y="CAPSULE", training_frame=train, validation_frame=train)
gbm Model Build Progress: [##################################################] 100%
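# Note that the model above is validated on its own training frame. A hedged sketch of
# holding out a validation split instead, with the predictors listed by name rather
# than by column index (the split_frame ratio and seed below are illustrative choices):
splits = train.split_frame(ratios=[0.8], seed=1234)
gbm_holdout = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=50, learn_rate=0.1)
gbm_holdout.train(x=["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
                  y="CAPSULE", training_frame=splits[0], validation_frame=splits[1])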
my_gbm_metrics = my_gbm.model_performance(train)
my_gbm_metrics.show()
ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.07584147467507414
R^2: 0.6846762562816877
LogLoss: 0.2744668128481441
AUC: 0.9780311537243385
Gini: 0.9560623074486769

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4549496668047897:
 | 0 | 1 | Error | Rate |
0 | 216.0 | 11.0 | 0.0485 | (11.0/227.0) |
1 | 14.0 | 139.0 | 0.0915 | (14.0/153.0) |
Total | 230.0 | 150.0 | 0.0658 | (25.0/380.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
metric | threshold | value | idx |
max f1 | 0.4549497 | 0.9174917 | 149.0 |
max f2 | 0.3032010 | 0.9394314 | 196.0 |
max f0point5 | 0.4728313 | 0.9244265 | 146.0 |
max accuracy | 0.4549497 | 0.9342105 | 149.0 |
max precision | 0.9747938 | 1.0 | 0.0 |
max absolute_MCC | 0.4549497 | 0.8629130 | 149.0 |
max min_per_class_accuracy | 0.4373995 | 0.9215686 | 156.0 |
Gains/Lift Table: Avg response rate: 40.26 %
group | lower_threshold | cumulative_data_fraction | response_rate | cumulative_response_rate | capture_rate | cumulative_capture_rate | lift | cumulative_lift | gain | cumulative_gain | |
1 | 0.9405750 | 0.05 | 1.0 | 1.0 | 0.1241830 | 0.1241830 | 2.4836601 | 2.4836601 | 148.3660131 | 148.3660131 | |
2 | 0.8921980 | 0.1 | 1.0 | 1.0 | 0.1241830 | 0.2483660 | 2.4836601 | 2.4836601 | 148.3660131 | 148.3660131 | |
3 | 0.8263695 | 0.15 | 1.0 | 1.0 | 0.1241830 | 0.3725490 | 2.4836601 | 2.4836601 | 148.3660131 | 148.3660131 | |
4 | 0.7595460 | 0.2 | 0.9473684 | 0.9868421 | 0.1176471 | 0.4901961 | 2.3529412 | 2.4509804 | 135.2941176 | 145.0980392 | |
5 | 0.7081926 | 0.25 | 1.0 | 0.9894737 | 0.1241830 | 0.6143791 | 2.4836601 | 2.4575163 | 148.3660131 | 145.7516340 | |
6 | 0.6364312 | 0.3 | 0.8947368 | 0.9736842 | 0.1111111 | 0.7254902 | 2.2222222 | 2.4183007 | 122.2222222 | 141.8300654 | |
7 | 0.5478651 | 0.35 | 0.6842105 | 0.9323308 | 0.0849673 | 0.8104575 | 1.6993464 | 2.3155929 | 69.9346405 | 131.5592904 | |
8 | 0.4499827 | 0.4 | 0.7894737 | 0.9144737 | 0.0980392 | 0.9084967 | 1.9607843 | 2.2712418 | 96.0784314 | 127.1241830 | |
9 | 0.3927870 | 0.45 | 0.2105263 | 0.8362573 | 0.0261438 | 0.9346405 | 0.5228758 | 2.0769789 | -47.7124183 | 107.6978940 | |
10 | 0.3207657 | 0.5 | 0.3157895 | 0.7842105 | 0.0392157 | 0.9738562 | 0.7843137 | 1.9477124 | -21.5686275 | 94.7712418 | |
11 | 0.2425744 | 0.55 | 0.1578947 | 0.7272727 | 0.0196078 | 0.9934641 | 0.3921569 | 1.8062983 | -60.7843137 | 80.6298277 | |
12 | 0.1977616 | 0.6 | 0.0 | 0.6666667 | 0.0 | 0.9934641 | 0.0 | 1.6557734 | -100.0 | 65.5773420 | |
13 | 0.1586941 | 0.65 | 0.0526316 | 0.6194332 | 0.0065359 | 1.0 | 0.1307190 | 1.5384615 | -86.9281046 | 53.8461538 | |
14 | 0.1353591 | 0.7 | 0.0 | 0.5751880 | 0.0 | 1.0 | 0.0 | 1.4285714 | -100.0 | 42.8571429 | |
15 | 0.1094101 | 0.75 | 0.0 | 0.5368421 | 0.0 | 1.0 | 0.0 | 1.3333333 | -100.0 | 33.3333333 | |
16 | 0.0923828 | 0.8 | 0.0 | 0.5032895 | 0.0 | 1.0 | 0.0 | 1.25 | -100.0 | 25.0 | |
17 | 0.0665933 | 0.85 | 0.0 | 0.4736842 | 0.0 | 1.0 | 0.0 | 1.1764706 | -100.0 | 17.6470588 | |
18 | 0.0477968 | 0.9 | 0.0 | 0.4473684 | 0.0 | 1.0 | 0.0 | 1.1111111 | -100.0 | 11.1111111 | |
19 | 0.0276973 | 0.95 | 0.0 | 0.4238227 | 0.0 | 1.0 | 0.0 | 1.0526316 | -100.0 | 5.2631579 | |
20 | 0.0125566 | 1.0 | 0.0 | 0.4026316 | 0.0 | 1.0 | 0.0 | 1.0 | -100.0 | 0.0 |
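# To score with the trained model, predict returns a frame with the predicted label and
# the per-class probabilities; it is run against the training frame here only because
# this example loads no separate test set:
preds = my_gbm.predict(train)
preds.head()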