import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Connect to a pre-existing cluster
h2o.init()
H2O cluster uptime: | 5 seconds 730 milliseconds |
H2O cluster version: | 3.7.0.99999 |
H2O cluster name: | spIdea |
H2O cluster total nodes: | 1 |
H2O cluster total free memory: | 12.44 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
H2O Connection proxy: | None |
Python Version: | 3.5.0 |
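# h2o.init() with no arguments starts or attaches to a cluster on the local machine.
# To attach to a cluster running elsewhere, the address can be given explicitly; a
# minimal sketch (the ip/port values below are placeholders, not part of this example):
h2o.init(ip="127.0.0.1", port=54321)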
from h2o.utils.shared_utils import _locate # private function used to find files within the h2o git project directory
df = h2o.import_file(path=_locate("smalldata/logreg/prostate.csv"))
Parse Progress: [##################################################] 100%
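# _locate only resolves paths inside a checkout of the h2o git repository. Outside of
# it, the same dataset can be loaded by passing a local path or URL directly to
# h2o.import_file; the S3 URL below is assumed to be the public h2o test-data copy:
df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")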
df.describe()
Rows: 380  Cols: 9
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
CBS | Bits | 1 | 11.111112 | 118 B | 2.4210093 |
C1N | 1-Byte Integers (w/o NAs) | 5 | 55.555557 | 2.2 KB | 45.958145 |
C2 | 2-Byte Integers | 1 | 11.111112 | 828 B | 16.9881 |
C2S | 2-Byte Fractions | 2 | 22.222223 | 1.6 KB | 34.632744 |
Frame distribution summary:
 | size | number_of_rows | number_of_chunks_per_column | number_of_chunks |
172.16.2.84:54321 | 4.8 KB | 380.0 | 1.0 | 9.0 |
mean | 4.8 KB | 380.0 | 1.0 | 9.0 |
min | 4.8 KB | 380.0 | 1.0 | 9.0 |
max | 4.8 KB | 380.0 | 1.0 | 9.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 4.8 KB | 380.0 | 1.0 | 9.0 |
 | ID | CAPSULE | AGE | RACE | DPROS | DCAPS | PSA | VOL | GLEASON |
---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | int | int | real | real | int |
mins | 1.0 | 0.0 | 43.0 | 0.0 | 1.0 | 1.0 | 0.3 | 0.0 | 0.0 |
mean | 190.5 | 0.4026315789473684 | 66.03947368421049 | 1.0868421052631572 | 2.2710526315789488 | 1.1078947368421048 | 15.408631578947375 | 15.812921052631573 | 6.3842105263157904 |
maxs | 380.0 | 1.0 | 79.0 | 2.0 | 4.0 | 2.0 | 139.70000000000002 | 97.60000000000001 | 9.0 |
sigma | 109.84079387914127 | 0.4910743389630552 | 6.527071269173311 | 0.3087732580252793 | 1.0001076181502861 | 0.3106564493514939 | 19.99757266856046 | 18.347619967271175 | 1.0919533744261092 |
zeros | 0 | 227 | 0 | 3 | 0 | 0 | 0 | 167 | 2 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 1.0 | 0.0 | 65.0 | 1.0 | 2.0 | 1.0 | 1.4000000000000001 | 0.0 | 6.0 |
1 | 2.0 | 0.0 | 72.0 | 1.0 | 3.0 | 2.0 | 6.7 | 0.0 | 7.0 |
2 | 3.0 | 0.0 | 70.0 | 1.0 | 1.0 | 2.0 | 4.9 | 0.0 | 6.0 |
3 | 4.0 | 0.0 | 76.0 | 2.0 | 2.0 | 1.0 | 51.2 | 20.0 | 7.0 |
4 | 5.0 | 0.0 | 69.0 | 1.0 | 1.0 | 1.0 | 12.3 | 55.9 | 6.0 |
5 | 6.0 | 1.0 | 71.0 | 1.0 | 3.0 | 2.0 | 3.3000000000000003 | 0.0 | 8.0 |
6 | 7.0 | 0.0 | 68.0 | 2.0 | 4.0 | 2.0 | 31.900000000000002 | 0.0 | 7.0 |
7 | 8.0 | 0.0 | 61.0 | 2.0 | 4.0 | 2.0 | 66.7 | 27.2 | 7.0 |
8 | 9.0 | 0.0 | 69.0 | 1.0 | 1.0 | 1.0 | 3.9 | 24.0 | 7.0 |
9 | 10.0 | 0.0 | 68.0 | 2.0 | 1.0 | 2.0 | 13.0 | 0.0 | 6.0 |
# Remove ID from training frame
train = df.drop("ID")
# For VOL & GLEASON, a zero really means "missing"
vol = train['VOL']
vol[vol == 0] = None
gle = train['GLEASON']
gle[gle == 0] = None
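# Depending on the h2o-py version, the column slices above may be copies rather than
# views of train; a hedged sketch of assigning the modified columns back so the new
# NAs are reflected in the training frame:
train['VOL'] = vol
train['GLEASON'] = gle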
# Convert CAPSULE to a logical factor
train['CAPSULE'] = train['CAPSULE'].asfactor()
# See that the data is ready
train.describe()
Rows: 380  Cols: 8
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
CBS | Bits | 1 | 12.5 | 118 B | 2.9164608 |
C1N | 1-Byte Integers (w/o NAs) | 5 | 62.5 | 2.2 KB | 55.363323 |
C2S | 2-Byte Fractions | 2 | 25.0 | 1.6 KB | 41.72022 |
Frame distribution summary:
 | size | number_of_rows | number_of_chunks_per_column | number_of_chunks |
172.16.2.84:54321 | 4.0 KB | 380.0 | 1.0 | 8.0 |
mean | 4.0 KB | 380.0 | 1.0 | 8.0 |
min | 4.0 KB | 380.0 | 1.0 | 8.0 |
max | 4.0 KB | 380.0 | 1.0 | 8.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 4.0 KB | 380.0 | 1.0 | 8.0 |
 | CAPSULE | AGE | RACE | DPROS | DCAPS | PSA | VOL | GLEASON |
---|---|---|---|---|---|---|---|---|
type | enum | int | int | int | int | real | real | int |
mins | 0.0 | 43.0 | 0.0 | 1.0 | 1.0 | 0.3 | 0.0 | 0.0 |
mean | 0.4026315789473684 | 66.03947368421049 | 1.0868421052631572 | 2.2710526315789488 | 1.1078947368421048 | 15.408631578947375 | 15.812921052631573 | 6.3842105263157904 |
maxs | 1.0 | 79.0 | 2.0 | 4.0 | 2.0 | 139.70000000000002 | 97.60000000000001 | 9.0 |
sigma | 0.4910743389630552 | 6.527071269173311 | 0.3087732580252793 | 1.0001076181502861 | 0.3106564493514939 | 19.99757266856046 | 18.347619967271175 | 1.0919533744261092 |
zeros | 227 | 0 | 3 | 0 | 0 | 0 | 167 | 2 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 65.0 | 1.0 | 2.0 | 1.0 | 1.4000000000000001 | 0.0 | 6.0 |
1 | 0 | 72.0 | 1.0 | 3.0 | 2.0 | 6.7 | 0.0 | 7.0 |
2 | 0 | 70.0 | 1.0 | 1.0 | 2.0 | 4.9 | 0.0 | 6.0 |
3 | 0 | 76.0 | 2.0 | 2.0 | 1.0 | 51.2 | 20.0 | 7.0 |
4 | 0 | 69.0 | 1.0 | 1.0 | 1.0 | 12.3 | 55.9 | 6.0 |
5 | 1 | 71.0 | 1.0 | 3.0 | 2.0 | 3.3000000000000003 | 0.0 | 8.0 |
6 | 0 | 68.0 | 2.0 | 4.0 | 2.0 | 31.900000000000002 | 0.0 | 7.0 |
7 | 0 | 61.0 | 2.0 | 4.0 | 2.0 | 66.7 | 27.2 | 7.0 |
8 | 0 | 69.0 | 1.0 | 1.0 | 1.0 | 3.9 | 24.0 | 7.0 |
9 | 0 | 68.0 | 2.0 | 1.0 | 2.0 | 13.0 | 0.0 | 6.0 |
# Run GBM
my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=50, learn_rate=0.1)
my_gbm.train(x=list(range(1,train.ncol)), y="CAPSULE", training_frame=train, validation_frame=train)
gbm Model Build Progress: [##################################################] 100%
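# Note that the model above is validated on its own training frame. A hedged sketch of
# holding out a validation split instead, with the predictors listed by name rather
# than by column index (the split_frame ratio and seed below are illustrative choices):
splits = train.split_frame(ratios=[0.8], seed=1234)
gbm_holdout = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=50, learn_rate=0.1)
gbm_holdout.train(x=["AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
                  y="CAPSULE", training_frame=splits[0], validation_frame=splits[1])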
my_gbm_metrics = my_gbm.model_performance(train)
my_gbm_metrics.show()
ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.07584147467507414
R^2: 0.6846762562816877
LogLoss: 0.2744668128481441
AUC: 0.9780311537243385
Gini: 0.9560623074486769

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4549496668047897:
 | 0 | 1 | Error | Rate |
0 | 216.0 | 11.0 | 0.0485 | (11.0/227.0) |
1 | 14.0 | 139.0 | 0.0915 | (14.0/153.0) |
Total | 230.0 | 150.0 | 0.0658 | (25.0/380.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
metric | threshold | value | idx |
max f1 | 0.4549497 | 0.9174917 | 149.0 |
max f2 | 0.3032010 | 0.9394314 | 196.0 |
max f0point5 | 0.4728313 | 0.9244265 | 146.0 |
max accuracy | 0.4549497 | 0.9342105 | 149.0 |
max precision | 0.9747938 | 1.0 | 0.0 |
max absolute_MCC | 0.4549497 | 0.8629130 | 149.0 |
max min_per_class_accuracy | 0.4373995 | 0.9215686 | 156.0 |
Gains/Lift Table: Avg response rate: 40.26 %
group | lower_threshold | cumulative_data_fraction | response_rate | cumulative_response_rate | capture_rate | cumulative_capture_rate | lift | cumulative_lift | gain | cumulative_gain | |
1 | 0.9405750 | 0.05 | 1.0 | 1.0 | 0.1241830 | 0.1241830 | 2.4836601 | 2.4836601 | 148.3660131 | 148.3660131 | |
2 | 0.8921980 | 0.1 | 1.0 | 1.0 | 0.1241830 | 0.2483660 | 2.4836601 | 2.4836601 | 148.3660131 | 148.3660131 | |
3 | 0.8263695 | 0.15 | 1.0 | 1.0 | 0.1241830 | 0.3725490 | 2.4836601 | 2.4836601 | 148.3660131 | 148.3660131 | |
4 | 0.7595460 | 0.2 | 0.9473684 | 0.9868421 | 0.1176471 | 0.4901961 | 2.3529412 | 2.4509804 | 135.2941176 | 145.0980392 | |
5 | 0.7081926 | 0.25 | 1.0 | 0.9894737 | 0.1241830 | 0.6143791 | 2.4836601 | 2.4575163 | 148.3660131 | 145.7516340 | |
6 | 0.6364312 | 0.3 | 0.8947368 | 0.9736842 | 0.1111111 | 0.7254902 | 2.2222222 | 2.4183007 | 122.2222222 | 141.8300654 | |
7 | 0.5478651 | 0.35 | 0.6842105 | 0.9323308 | 0.0849673 | 0.8104575 | 1.6993464 | 2.3155929 | 69.9346405 | 131.5592904 | |
8 | 0.4499827 | 0.4 | 0.7894737 | 0.9144737 | 0.0980392 | 0.9084967 | 1.9607843 | 2.2712418 | 96.0784314 | 127.1241830 | |
9 | 0.3927870 | 0.45 | 0.2105263 | 0.8362573 | 0.0261438 | 0.9346405 | 0.5228758 | 2.0769789 | -47.7124183 | 107.6978940 | |
10 | 0.3207657 | 0.5 | 0.3157895 | 0.7842105 | 0.0392157 | 0.9738562 | 0.7843137 | 1.9477124 | -21.5686275 | 94.7712418 | |
11 | 0.2425744 | 0.55 | 0.1578947 | 0.7272727 | 0.0196078 | 0.9934641 | 0.3921569 | 1.8062983 | -60.7843137 | 80.6298277 | |
12 | 0.1977616 | 0.6 | 0.0 | 0.6666667 | 0.0 | 0.9934641 | 0.0 | 1.6557734 | -100.0 | 65.5773420 | |
13 | 0.1586941 | 0.65 | 0.0526316 | 0.6194332 | 0.0065359 | 1.0 | 0.1307190 | 1.5384615 | -86.9281046 | 53.8461538 | |
14 | 0.1353591 | 0.7 | 0.0 | 0.5751880 | 0.0 | 1.0 | 0.0 | 1.4285714 | -100.0 | 42.8571429 | |
15 | 0.1094101 | 0.75 | 0.0 | 0.5368421 | 0.0 | 1.0 | 0.0 | 1.3333333 | -100.0 | 33.3333333 | |
16 | 0.0923828 | 0.8 | 0.0 | 0.5032895 | 0.0 | 1.0 | 0.0 | 1.25 | -100.0 | 25.0 | |
17 | 0.0665933 | 0.85 | 0.0 | 0.4736842 | 0.0 | 1.0 | 0.0 | 1.1764706 | -100.0 | 17.6470588 | |
18 | 0.0477968 | 0.9 | 0.0 | 0.4473684 | 0.0 | 1.0 | 0.0 | 1.1111111 | -100.0 | 11.1111111 | |
19 | 0.0276973 | 0.95 | 0.0 | 0.4238227 | 0.0 | 1.0 | 0.0 | 1.0526316 | -100.0 | 5.2631579 | |
20 | 0.0125566 | 1.0 | 0.0 | 0.4026316 | 0.0 | 1.0 | 0.0 | 1.0 | -100.0 | 0.0 |
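# To score with the trained model, predict returns a frame with the predicted label and
# the per-class probabilities; it is run against the training frame here only because
# this example loads no separate test set:
preds = my_gbm.predict(train)
preds.head()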