import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Connect this session to an H2O cluster. The status table printed below
# describes the cluster that was attached (here a single node reachable at
# 127.0.0.1:54321).
h2o.init()
Warning: Version mismatch. H2O is version 3.5.0.99999, but the python package is version UNKNOWN.
H2O cluster uptime: | 46 minutes 47 seconds 756 milliseconds |
H2O cluster version: | 3.5.0.99999 |
H2O cluster name: | ludirehak |
H2O cluster total nodes: | 1 |
H2O cluster total memory: | 4.44 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
# Load the airlines training data into H2O as a frame. _locate resolves the
# relative path to the file inside the h2o git project directory.
air = h2o.import_file(
    path=_locate("smalldata/airlines/AirlinesTrain.csv.zip")
)
Parse Progress: [##################################################] 100% Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTrain.csv.zip. Parsed 24,421 rows and 12 cols
# Split the rows by sampling: about 80% for training, about 20% for validation.
# r is one random column as tall as air.nrow; the two comparisons against 0.8
# are complementary, so every row lands in exactly one of the two frames.
r = air[0].runif()
air_train = air[r < 0.8]
air_valid = air[r >= 0.8]

# Response column, then the predictor columns passed to the models below.
myY = "IsDepDelayed"
myX = [
    "Origin",
    "Dest",
    "Distance",
    "UniqueCarrier",
    "fMonth",
    "fDayofMonth",
    "fDayOfWeek",
]
# GBM: 100 trees of depth 3 with a learning rate of 0.01, using a bernoulli
# distribution for the two-class response.
gbm = H2OGradientBoostingEstimator(
    distribution="bernoulli",
    ntrees=100,
    max_depth=3,
    learn_rate=0.01,
)

# Fit on the training frame; the validation frame is scored alongside it.
gbm.train(
    x=myX,
    y=myY,
    training_frame=air_train,
    validation_frame=air_valid,
)

# Print the model report (summary, metrics, scoring history, importances).
gbm.show()
gbm Model Build Progress: [##################################################] 100% Model Details ============= H2OGradientBoostingEstimator : Gradient Boosting Machine Model Key: GBM_model_python_1445544453075_131 Model Summary:
number_of_trees | model_size_in_bytes | min_depth | max_depth | mean_depth | min_leaves | max_leaves | mean_leaves | |
100.0 | 21616.0 | 3.0 | 3.0 | 3.0 | 8.0 | 8.0 | 8.0 |
ModelMetricsBinomial: gbm ** Reported on train data. ** MSE: 0.225577653051 R^2: 0.0898968077725 LogLoss: 0.643152070892 AUC: 0.698999790699 Gini: 0.397999581398 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.449028022489:
NO | YES | Error | Rate | |
NO | 2837.0 | 6004.0 | 0.6791 | (6004.0/8841.0) |
YES | 1198.0 | 9446.0 | 0.1126 | (1198.0/10644.0) |
Total | 4035.0 | 15450.0 | 0.3696 | (7202.0/19485.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
metric | threshold | value | idx |
max f1 | 0.4 | 0.7 | 327.0 |
max f2 | 0.4 | 0.9 | 396.0 |
max f0point5 | 0.5 | 0.7 | 217.0 |
max accuracy | 0.5 | 0.7 | 217.0 |
max precision | 0.7 | 1.0 | 0.0 |
max absolute_MCC | 0.5 | 0.3 | 217.0 |
max min_per_class_accuracy | 0.5 | 0.6 | 199.0 |
ModelMetricsBinomial: gbm ** Reported on validation data. ** MSE: 0.226773773291 R^2: 0.0840250526986 LogLoss: 0.64567275652 AUC: 0.689332681253 Gini: 0.378665362506 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.447676388566:
NO | YES | Error | Rate | |
NO | 709.0 | 1516.0 | 0.6813 | (1516.0/2225.0) |
YES | 271.0 | 2440.0 | 0.1 | (271.0/2711.0) |
Total | 980.0 | 3956.0 | 0.362 | (1787.0/4936.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
metric | threshold | value | idx |
max f1 | 0.4 | 0.7 | 332.0 |
max f2 | 0.4 | 0.9 | 390.0 |
max f0point5 | 0.5 | 0.7 | 223.0 |
max accuracy | 0.5 | 0.6 | 278.0 |
max precision | 0.7 | 0.9 | 3.0 |
max absolute_MCC | 0.5 | 0.3 | 223.0 |
max min_per_class_accuracy | 0.5 | 0.6 | 205.0 |
Scoring History:
timestamp | duration | number_of_trees | training_MSE | training_logloss | training_AUC | training_classification_error | validation_MSE | validation_logloss | validation_AUC | validation_classification_error | |
2015-10-22 13:54:21 | 0.064 sec | 1.0 | 0.2 | 0.7 | 0.7 | 0.4 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-22 13:54:21 | 0.105 sec | 2.0 | 0.2 | 0.7 | 0.7 | 0.4 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-22 13:54:21 | 0.127 sec | 3.0 | 0.2 | 0.7 | 0.7 | 0.4 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-22 13:54:21 | 0.148 sec | 4.0 | 0.2 | 0.7 | 0.7 | 0.4 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-22 13:54:21 | 0.165 sec | 5.0 | 0.2 | 0.7 | 0.7 | 0.4 | 0.2 | 0.7 | 0.7 | 0.4 | |
--- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
2015-10-22 13:54:25 | 3.670 sec | 74.0 | 0.2 | 0.7 | 0.7 | 0.4 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-22 13:54:25 | 3.759 sec | 75.0 | 0.2 | 0.6 | 0.7 | 0.4 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-22 13:54:25 | 3.848 sec | 76.0 | 0.2 | 0.6 | 0.7 | 0.4 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-22 13:54:25 | 3.942 sec | 77.0 | 0.2 | 0.6 | 0.7 | 0.4 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-22 13:54:25 | 4.253 sec | 100.0 | 0.2 | 0.6 | 0.7 | 0.4 | 0.2 | 0.6 | 0.7 | 0.4 |
Variable Importances:
variable | relative_importance | scaled_importance | percentage |
Origin | 16932.9 | 1.0 | 0.7 |
Dest | 4282.9 | 0.3 | 0.2 |
UniqueCarrier | 1623.9 | 0.1 | 0.1 |
fDayofMonth | 1457.2 | 0.1 | 0.1 |
fDayOfWeek | 95.7 | 0.0 | 0.0 |
fMonth | 67.4 | 0.0 | 0.0 |
Distance | 0.0 | 0.0 | 0.0 |
# GLM: binomial family fitted with the L_BFGS solver, trained on the same
# predictors, response and train/validation frames as the GBM.
glm = H2OGeneralizedLinearEstimator(
    family="binomial",
    solver="L_BFGS",
)

glm.train(
    x=myX,
    y=myY,
    training_frame=air_train,
    validation_frame=air_valid,
)

# Pretty-print the fitted coefficients.
glm.pprint_coef()
glm Model Build Progress: [##################################################] 100% Coefficients: glm coefficients
names | coefficients | standardized_coefficients |
Intercept | 0.1 | 0.2 |
Origin.ABE | -0.0 | -0.0 |
Origin.ABQ | -0.0 | -0.0 |
Origin.ACY | -0.0 | -0.0 |
Origin.ALB | 0.0 | 0.0 |
--- | --- | --- |
fDayOfWeek.f6 | -0.1 | -0.1 |
fDayOfWeek.f7 | 0.0 | 0.0 |
fMonth.f1 | -0.1 | -0.1 |
fMonth.f10 | 0.1 | 0.1 |
Distance | 0.0 | 0.1 |
# Load the separate airlines test file into H2O; it is used only for scoring.
air_test = h2o.import_file(
    path=_locate("smalldata/airlines/AirlinesTest.csv.zip")
)
Parse Progress: [##################################################] 100% Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTest.csv.zip. Parsed 2,691 rows and 12 cols
# Score both models on the test frame. For each model: predict on air_test,
# show the first rows of the predictions, then compute and show the model's
# performance metrics on air_test. The *_perf objects are reused further down.
# --- GBM ---
gbm_pred = gbm.predict(air_test)
print("GBM predictions: ")
gbm_pred.head()
gbm_perf = gbm.model_performance(air_test)
print("GBM performance: ")
gbm_perf.show()
# --- GLM ---
glm_pred = glm.predict(air_test)
print("GLM predictions: ")
glm_pred.head()
glm_perf = glm.model_performance(air_test)
print("GLM performance: ")
glm_perf.show()
GBM predictions: H2OFrame with 2691 rows and 3 columns:
predict | YES | YES | YES | YES | YES | YES | NO | NO | NO | NO |
NO | 0.5 | 0.5 | 0.5 | 0.4 | 0.5 | 0.5 | 0.6 | 0.6 | 0.6 | 0.6 |
YES | 0.5 | 0.5 | 0.5 | 0.6 | 0.5 | 0.5 | 0.4 | 0.4 | 0.4 | 0.4 |
GBM performance: ModelMetricsBinomial: gbm ** Reported on test data. ** MSE: 0.226299117103 R^2: 0.086471305524 LogLoss: 0.644721964315 AUC: 0.693439503015 Gini: 0.386879006031 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.446332927983:
NO | YES | Error | Rate | |
NO | 345.0 | 872.0 | 0.7165 | (872.0/1217.0) |
YES | 136.0 | 1338.0 | 0.0923 | (136.0/1474.0) |
Total | 481.0 | 2210.0 | 0.3746 | (1008.0/2691.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
metric | threshold | value | idx |
max f1 | 0.4 | 0.7 | 335.0 |
max f2 | 0.4 | 0.9 | 391.0 |
max f0point5 | 0.5 | 0.7 | 220.0 |
max accuracy | 0.5 | 0.7 | 225.0 |
max precision | 0.7 | 1.0 | 0.0 |
max absolute_MCC | 0.5 | 0.3 | 225.0 |
max min_per_class_accuracy | 0.6 | 0.6 | 203.0 |
GLM predictions: H2OFrame with 2691 rows and 3 columns:
predict | YES | YES | YES | YES | YES | YES | YES | YES | YES | YES |
p0 | 0.4 | 0.4 | 0.4 | 0.5 | 0.5 | 0.5 | 0.5 | 0.4 | 0.5 | 0.5 |
p1 | 0.6 | 0.6 | 0.6 | 0.5 | 0.5 | 0.5 | 0.5 | 0.6 | 0.5 | 0.5 |
GLM performance: ModelMetricsBinomialGLM: glm ** Reported on test data. ** MSE: 0.232028999965 R^2: 0.0633408025091 LogLoss: 0.656433714264 Null degrees of freedom: 2690 Residual degrees of freedom: 2438 Null deviance: 3705.96023003 Residual deviance: 3532.92625017 AIC: 4038.92625017 AUC: 0.656781919193 Gini: 0.313563838386 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459695105317:
NO | YES | Error | Rate | |
NO | 280.0 | 937.0 | 0.7699 | (937.0/1217.0) |
YES | 106.0 | 1368.0 | 0.0719 | (106.0/1474.0) |
Total | 386.0 | 2305.0 | 0.3876 | (1043.0/2691.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
metric | threshold | value | idx |
max f1 | 0.5 | 0.7 | 309.0 |
max f2 | 0.4 | 0.9 | 391.0 |
max f0point5 | 0.5 | 0.7 | 256.0 |
max accuracy | 0.5 | 0.6 | 256.0 |
max precision | 0.8 | 1.0 | 0.0 |
max absolute_MCC | 0.5 | 0.2 | 257.0 |
max min_per_class_accuracy | 0.6 | 0.6 | 192.0 |
# Confusion matrices on the test set, taken from the test-set performance
# objects (gbm_perf / glm_perf) computed earlier.
gbm_CM = gbm_perf.confusion_matrix()
print(gbm_CM)
# Blank separator between the two matrices. A bare `print` emits a newline
# only as a Python 2 statement; under Python 3 it is a no-op expression.
# print("") writes an empty line on both.
print("")
glm_CM = glm_perf.confusion_matrix()
print(glm_CM)
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.446332927983:
NO | YES | Error | Rate | |
NO | 345.0 | 872.0 | 0.7165 | (872.0/1217.0) |
YES | 136.0 | 1338.0 | 0.0923 | (136.0/1474.0) |
Total | 481.0 | 2210.0 | 0.3746 | (1008.0/2691.0) |
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459695105317:
NO | YES | Error | Rate | |
NO | 280.0 | 937.0 | 0.7699 | (937.0/1217.0) |
YES | 106.0 | 1368.0 | 0.0719 | (106.0/1474.0) |
Total | 386.0 | 2305.0 | 0.3876 | (1043.0/2691.0) |
# Precision, accuracy and AUC on the test set for each model, read from the
# test-set performance objects (gbm_perf / glm_perf) computed earlier.
print('GBM Precision: {0}'.format(gbm_perf.precision()))
print('GBM Accuracy: {0}'.format(gbm_perf.accuracy()))
print('GBM AUC: {0}'.format(gbm_perf.auc()))
# Blank separator between the two models. A bare `print` emits a newline only
# as a Python 2 statement; under Python 3 it is a no-op expression.
# print("") writes an empty line on both.
print("")
print('GLM Precision: {0}'.format(glm_perf.precision()))
print('GLM Accuracy: {0}'.format(glm_perf.accuracy()))
print('GLM AUC: {0}'.format(glm_perf.auc()))
GBM Precision: [[0.7017496139979671, 1.0]] GBM Accuracy: [[0.5351575413437655, 0.6596060943887031]] GBM AUC: 0.693439503015 GLM Precision: [[0.7667089295101112, 1.0]] GLM Accuracy: [[0.512905531794376, 0.63173541434411]] GLM AUC: 0.656781919193