import h2o
# Initialise the H2O connection; the cluster status table is printed below
h2o.init()
H2O cluster uptime: | 2 minutes 13 seconds 669 milliseconds |
H2O cluster version: | 3.1.0.99999 |
H2O cluster name: | ece |
H2O cluster total nodes: | 1 |
H2O cluster total memory: | 4.44 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
# Airlines dataset (training file); h2o.locate() is given a repo-relative path
# and the import reports the absolute path it was resolved to
air = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
Parse Progress: [##################################################] 100% Imported /Users/ece/0xdata/h2o-dev/smalldata/airlines/AirlinesTrain.csv.zip . Parsed 24,421 rows and 12 cols
# Predictor columns and the response column used for the model
myX = ["Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth", "fDayOfWeek"]
myY = "IsDepDelayed"

# Construct validation and training datasets by sampling (20/80):
# rows with r < 0.8 go to training, the remaining rows go to validation
r = air[0].runif()
air_train = air[r < 0.8]
air_valid = air[r >= 0.8]

# Build gbm, supplying the validation frame so validation metrics are
# available alongside the training metrics
gbm = h2o.gbm(
    x=air_train[myX],
    y=air_train[myY],
    validation_x=air_valid[myX],
    validation_y=air_valid[myY],
    distribution="bernoulli",
    ntrees=100,
    max_depth=3,
    learn_rate=0.01,
)
gbm Model Build Progress: [##################################################] 100%
# Show various confusion matrices for training dataset (based on metric(s))
print gbm.confusion_matrix() # maximum f1 threshold chosen by default
print gbm.confusion_matrix(metrics="f2")
print gbm.confusion_matrix(metrics="precision")
# A list of metrics yields one confusion matrix per metric, in the order given
cms = gbm.confusion_matrix(metrics=["accuracy", "f0point5"])
print cms[0]
print cms[1]
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.439860523006:
NO | YES | Error | Rate | |
NO | 2473.0 | 6377.0 | 0.7206 | (6377.0/8850.0) |
YES | 964.0 | 9749.0 | 0.09 | (964.0/10713.0) |
Total | 3437.0 | 16126.0 | 0.8106 | (0.8106/19563.0) |
Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.398529984517:
NO | YES | Error | Rate | |
NO | 355.0 | 8495.0 | 0.9599 | (8495.0/8850.0) |
YES | 60.0 | 10653.0 | 0.0056 | (60.0/10713.0) |
Total | 415.0 | 19148.0 | 0.9655 | (0.9655/19563.0) |
Confusion Matrix (Act/Pred) for max precision @ threshold = 0.684320673544:
NO | YES | Error | Rate | |
NO | 8832.0 | 18.0 | 0.002 | (18.0/8850.0) |
YES | 10562.0 | 151.0 | 0.9859 | (10562.0/10713.0) |
Total | 19394.0 | 169.0 | 0.9879 | (0.9879/19563.0) |
Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.533140769284:
NO | YES | Error | Rate | |
NO | 5093.0 | 3757.0 | 0.4245 | (3757.0/8850.0) |
YES | 3076.0 | 7637.0 | 0.2871 | (3076.0/10713.0) |
Total | 8169.0 | 11394.0 | 0.7116 | (0.7116/19563.0) |
Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.535870076134:
NO | YES | Error | Rate | |
NO | 5141.0 | 3709.0 | 0.4191 | (3709.0/8850.0) |
YES | 3128.0 | 7585.0 | 0.292 | (3128.0/10713.0) |
Total | 8269.0 | 11294.0 | 0.7111 | (0.7111/19563.0) |
# Show various confusion matrices for training dataset (based on threshold(s))
# A requested threshold that is not available is replaced by the closest one
# found (a message reports the substitution), so different requests can
# produce identical matrices.
print gbm.confusion_matrix(thresholds=0.77)
# A list of thresholds yields one confusion matrix per threshold, in the order given
cms = gbm.confusion_matrix(thresholds=[0.1, 0.5, 0.99])
print cms[0]
print cms[1]
print cms[2]
Could not find exact threshold 0.77; using closest threshold found 0.6869435993. Confusion Matrix (Act/Pred) @ threshold = 0.6869435993:
NO | YES | Error | Rate | |
NO | 8836.0 | 14.0 | 0.0016 | (14.0/8850.0) |
YES | 10597.0 | 116.0 | 0.9892 | (10597.0/10713.0) |
Total | 19433.0 | 130.0 | 0.9908 | (0.9908/19563.0) |
Could not find exact threshold 0.1; using closest threshold found 0.383907658296. Could not find exact threshold 0.5; using closest threshold found 0.500172069127. Could not find exact threshold 0.99; using closest threshold found 0.6869435993. Confusion Matrix (Act/Pred) @ threshold = 0.383907658296:
NO | YES | Error | Rate | |
NO | 0.0 | 8850.0 | 1.0 | (8850.0/8850.0) |
YES | 0.0 | 10713.0 | 0.0 | (0.0/10713.0) |
Total | 0.0 | 19563.0 | 1.0 | (1.0/19563.0) |
Confusion Matrix (Act/Pred) @ threshold = 0.500172069127:
NO | YES | Error | Rate | |
NO | 4223.0 | 4627.0 | 0.5228 | (4627.0/8850.0) |
YES | 2258.0 | 8455.0 | 0.2108 | (2258.0/10713.0) |
Total | 6481.0 | 13082.0 | 0.7336 | (0.7336/19563.0) |
Confusion Matrix (Act/Pred) @ threshold = 0.6869435993:
NO | YES | Error | Rate | |
NO | 8836.0 | 14.0 | 0.0016 | (14.0/8850.0) |
YES | 10597.0 | 116.0 | 0.9892 | (10597.0/10713.0) |
Total | 19433.0 | 130.0 | 0.9908 | (0.9908/19563.0) |
# Show various confusion matrices for validation dataset (based on metric(s))
# valid=True selects the validation metrics instead of the training metrics
print gbm.confusion_matrix(metrics="f2", valid=True)
print gbm.confusion_matrix(metrics="precision", valid=True)
# A list of metrics yields one confusion matrix per metric, in the order given
cms = gbm.confusion_matrix(metrics=["accuracy", "f0point5"], valid=True)
print cms[0]
print cms[1]
Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.398134501203:
NO | YES | Error | Rate | |
NO | 73.0 | 2143.0 | 0.9671 | (2143.0/2216.0) |
YES | 12.0 | 2630.0 | 0.0045 | (12.0/2642.0) |
Total | 85.0 | 4773.0 | 0.9716 | (0.9716/4858.0) |
Confusion Matrix (Act/Pred) for max precision @ threshold = 0.676912109388:
NO | YES | Error | Rate | |
NO | 2192.0 | 24.0 | 0.0108 | (24.0/2216.0) |
YES | 2488.0 | 154.0 | 0.9417 | (2488.0/2642.0) |
Total | 4680.0 | 178.0 | 0.9525 | (0.9525/4858.0) |
Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.539970628106:
NO | YES | Error | Rate | |
NO | 1308.0 | 908.0 | 0.4097 | (908.0/2216.0) |
YES | 803.0 | 1839.0 | 0.3039 | (803.0/2642.0) |
Total | 2111.0 | 2747.0 | 0.7136 | (0.7136/4858.0) |
Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.548587112251:
NO | YES | Error | Rate | |
NO | 1412.0 | 804.0 | 0.3628 | (804.0/2216.0) |
YES | 919.0 | 1723.0 | 0.3478 | (919.0/2642.0) |
Total | 2331.0 | 2527.0 | 0.7106 | (0.7106/4858.0) |
# Show various confusion matrices for training dataset (based on threshold(s))
# NOTE: valid=True is not passed here, so these matrices are computed on the
# training data (the recorded totals are 19563 rows, not the 4858 validation
# rows). Pass valid=True to each call and re-run to report on the validation
# frame instead.
print gbm.confusion_matrix(thresholds=0.77)
cms = gbm.confusion_matrix(thresholds=[0.25, 0.33, 0.44])
print cms[0]
print cms[1]
print cms[2]
Could not find exact threshold 0.77; using closest threshold found 0.6869435993. Confusion Matrix (Act/Pred) @ threshold = 0.6869435993:
NO | YES | Error | Rate | |
NO | 8836.0 | 14.0 | 0.0016 | (14.0/8850.0) |
YES | 10597.0 | 116.0 | 0.9892 | (10597.0/10713.0) |
Total | 19433.0 | 130.0 | 0.9908 | (0.9908/19563.0) |
Could not find exact threshold 0.25; using closest threshold found 0.383907658296. Could not find exact threshold 0.33; using closest threshold found 0.383907658296. Could not find exact threshold 0.44; using closest threshold found 0.439860523006. Confusion Matrix (Act/Pred) @ threshold = 0.383907658296:
NO | YES | Error | Rate | |
NO | 0.0 | 8850.0 | 1.0 | (8850.0/8850.0) |
YES | 0.0 | 10713.0 | 0.0 | (0.0/10713.0) |
Total | 0.0 | 19563.0 | 1.0 | (1.0/19563.0) |
Confusion Matrix (Act/Pred) @ threshold = 0.383907658296:
NO | YES | Error | Rate | |
NO | 0.0 | 8850.0 | 1.0 | (8850.0/8850.0) |
YES | 0.0 | 10713.0 | 0.0 | (0.0/10713.0) |
Total | 0.0 | 19563.0 | 1.0 | (1.0/19563.0) |
Confusion Matrix (Act/Pred) @ threshold = 0.439860523006:
NO | YES | Error | Rate | |
NO | 2473.0 | 6377.0 | 0.7206 | (6377.0/8850.0) |
YES | 964.0 | 9749.0 | 0.09 | (964.0/10713.0) |
Total | 3437.0 | 16126.0 | 0.8106 | (0.8106/19563.0) |
# Show various confusion matrices for training dataset (based on metric(s) AND threshold(s))
# NOTE: valid=True is not passed here, so these matrices are computed on the
# training data (the recorded totals are 19563 rows, not the 4858 validation
# rows). Pass valid=True to each call and re-run to report on the validation
# frame instead.
# When both arguments are given, the threshold-based matrices come first
# (in the order given), followed by the metric-based ones.
cms = gbm.confusion_matrix(thresholds=0.77, metrics="f1")
print cms[0]
print cms[1]
cms = gbm.confusion_matrix(thresholds=[0.25, 0.33], metrics=["f2", "f0point5"])
print cms[0]
print cms[1]
print cms[2]
print cms[3]
Could not find exact threshold 0.77; using closest threshold found 0.6869435993. Confusion Matrix (Act/Pred) @ threshold = 0.6869435993:
NO | YES | Error | Rate | |
NO | 8836.0 | 14.0 | 0.0016 | (14.0/8850.0) |
YES | 10597.0 | 116.0 | 0.9892 | (10597.0/10713.0) |
Total | 19433.0 | 130.0 | 0.9908 | (0.9908/19563.0) |
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.439860523006:
NO | YES | Error | Rate | |
NO | 2473.0 | 6377.0 | 0.7206 | (6377.0/8850.0) |
YES | 964.0 | 9749.0 | 0.09 | (964.0/10713.0) |
Total | 3437.0 | 16126.0 | 0.8106 | (0.8106/19563.0) |
Could not find exact threshold 0.25; using closest threshold found 0.383907658296. Could not find exact threshold 0.33; using closest threshold found 0.383907658296. Confusion Matrix (Act/Pred) @ threshold = 0.383907658296:
NO | YES | Error | Rate | |
NO | 0.0 | 8850.0 | 1.0 | (8850.0/8850.0) |
YES | 0.0 | 10713.0 | 0.0 | (0.0/10713.0) |
Total | 0.0 | 19563.0 | 1.0 | (1.0/19563.0) |
Confusion Matrix (Act/Pred) @ threshold = 0.383907658296:
NO | YES | Error | Rate | |
NO | 0.0 | 8850.0 | 1.0 | (8850.0/8850.0) |
YES | 0.0 | 10713.0 | 0.0 | (0.0/10713.0) |
Total | 0.0 | 19563.0 | 1.0 | (1.0/19563.0) |
Confusion Matrix (Act/Pred) for max f2 @ threshold = 0.398529984517:
NO | YES | Error | Rate | |
NO | 355.0 | 8495.0 | 0.9599 | (8495.0/8850.0) |
YES | 60.0 | 10653.0 | 0.0056 | (60.0/10713.0) |
Total | 415.0 | 19148.0 | 0.9655 | (0.9655/19563.0) |
Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.535870076134:
NO | YES | Error | Rate | |
NO | 5141.0 | 3709.0 | 0.4191 | (3709.0/8850.0) |
YES | 3128.0 | 7585.0 | 0.292 | (3128.0/10713.0) |
Total | 8269.0 | 11294.0 | 0.7111 | (0.7111/19563.0) |
# Test dataset
air_test = h2o.import_frame(path=h2o.locate("smalldata/airlines/AirlinesTest.csv.zip"))
# Test performance: evaluate the trained model on the test frame; the returned
# object is queried for confusion matrices below
gbm_perf = gbm.model_performance(air_test)
Parse Progress: [##################################################] 100% Imported /Users/ece/0xdata/h2o-dev/smalldata/airlines/AirlinesTest.csv.zip . Parsed 2,691 rows and 12 cols
# Show various confusion matrices for test dataset (based on metric(s))
print gbm_perf.confusion_matrix(metrics="f0point5")
print gbm_perf.confusion_matrix(metrics="min_per_class_accuracy")
# A list of metrics yields one confusion matrix per metric, in the order given
cms = gbm_perf.confusion_matrix(metrics=["accuracy", "f0point5"])
print cms[0]
print cms[1]
Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.532779731095:
NO | YES | Error | Rate | |
NO | 681.0 | 536.0 | 0.4404 | (536.0/1217.0) |
YES | 400.0 | 1074.0 | 0.2714 | (400.0/1474.0) |
Total | 1081.0 | 1610.0 | 0.7118 | (0.7118/2691.0) |
Confusion Matrix (Act/Pred) for max min_per_class_accuracy @ threshold = 0.554747145173:
NO | YES | Error | Rate | |
NO | 776.0 | 441.0 | 0.3624 | (441.0/1217.0) |
YES | 537.0 | 937.0 | 0.3643 | (537.0/1474.0) |
Total | 1313.0 | 1378.0 | 0.7267 | (0.7267/2691.0) |
Confusion Matrix (Act/Pred) for max accuracy @ threshold = 0.523011194769:
NO | YES | Error | Rate | |
NO | 648.0 | 569.0 | 0.4675 | (569.0/1217.0) |
YES | 363.0 | 1111.0 | 0.2463 | (363.0/1474.0) |
Total | 1011.0 | 1680.0 | 0.7138 | (0.7138/2691.0) |
Confusion Matrix (Act/Pred) for max f0point5 @ threshold = 0.532779731095:
NO | YES | Error | Rate | |
NO | 681.0 | 536.0 | 0.4404 | (536.0/1217.0) |
YES | 400.0 | 1074.0 | 0.2714 | (400.0/1474.0) |
Total | 1081.0 | 1610.0 | 0.7118 | (0.7118/2691.0) |
# Show various confusion matrices for test dataset (based on threshold(s))
# A requested threshold that is not available is replaced by the closest one
# found, so 0.75 and 0.88 below end up producing the same matrix.
print gbm_perf.confusion_matrix(thresholds=0.5)
cms = gbm_perf.confusion_matrix(thresholds=[0.01, 0.75, .88])
print cms[0]
print cms[1]
print cms[2]
Could not find exact threshold 0.5; using closest threshold found 0.500253589848. Confusion Matrix (Act/Pred) @ threshold = 0.500253589848:
NO | YES | Error | Rate | |
NO | 563.0 | 654.0 | 0.5374 | (654.0/1217.0) |
YES | 288.0 | 1186.0 | 0.1954 | (288.0/1474.0) |
Total | 851.0 | 1840.0 | 0.7328 | (0.7328/2691.0) |
Could not find exact threshold 0.01; using closest threshold found 0.383989388848. Could not find exact threshold 0.75; using closest threshold found 0.686943579735. Could not find exact threshold 0.88; using closest threshold found 0.686943579735. Confusion Matrix (Act/Pred) @ threshold = 0.383989388848:
NO | YES | Error | Rate | |
NO | 0.0 | 1217.0 | 1.0 | (1217.0/1217.0) |
YES | 0.0 | 1474.0 | 0.0 | (0.0/1474.0) |
Total | 0.0 | 2691.0 | 1.0 | (1.0/2691.0) |
Confusion Matrix (Act/Pred) @ threshold = 0.686943579735:
NO | YES | Error | Rate | |
NO | 1215.0 | 2.0 | 0.0016 | (2.0/1217.0) |
YES | 1458.0 | 16.0 | 0.9891 | (1458.0/1474.0) |
Total | 2673.0 | 18.0 | 0.9907 | (0.9907/2691.0) |
Confusion Matrix (Act/Pred) @ threshold = 0.686943579735:
NO | YES | Error | Rate | |
NO | 1215.0 | 2.0 | 0.0016 | (2.0/1217.0) |
YES | 1458.0 | 16.0 | 0.9891 | (1458.0/1474.0) |
Total | 2673.0 | 18.0 | 0.9907 | (0.9907/2691.0) |
# Convert a ConfusionMatrix to a python list of lists: [ [tns,fps], [fns,tps] ]
# With no arguments the max-f1 confusion matrix is used; the first list is for
# the training data, the second for the test-set performance object.
cm = gbm.confusion_matrix()
print cm.to_list()
cm = gbm_perf.confusion_matrix()
print cm.to_list()
[[2473, 6377], [964, 9749]] [[248, 969], [91, 1383]]