#----------------------------------------------------------------------
# Purpose: Condition an airline dataset by dropping the rows whose
# departure delay (DepDelay) is missing (NA) in the input dataset.
#
# Then treat any departure delay longer than minutesOfDelayWeTolerate
# minutes as delayed.
#----------------------------------------------------------------------
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Attach this session to an H2O cluster; the status table printed right after
# this call describes the cluster that was reached.
h2o.init()
H2O cluster uptime: | 19 minutes 25 seconds 953 milliseconds |
H2O cluster version: | 3.5.0.99999 |
H2O cluster name: | spIdea |
H2O cluster total nodes: | 1 |
H2O cluster total memory: | 12.44 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
# Resolve the bundled airlines sample inside the h2o project tree, then parse
# it into an H2OFrame.
airlinesPath = _locate("smalldata/airlines/allyears2k_headers.zip")
air = h2o.import_file(airlinesPath)
Parse Progress: [##################################################] 100%
# Report the shape of the raw frame. numCols is reused further down as the
# 0-based index of the column appended to the filtered frame.
numRows, numCols = air.dim
print("Original dataset rows: {0}, columns: {1}".format(numRows, numCols))

# Predictor columns and the name of the synthetic response column.
x_cols = [
    "Month",
    "DayofMonth",
    "DayOfWeek",
    "CRSDepTime",
    "CRSArrTime",
    "UniqueCarrier",
    "CRSElapsedTime",
    "Origin",
    "Dest",
    "Distance",
]
y_col = "SynthDepDelayed"

# Keep only the rows whose DepDelay is present (isna() == 0).
depDelayKnown = air["DepDelay"].isna() == 0
noDepDelayedNAs = air[depDelayKnown]
rows, cols = noDepDelayedNAs.dim
print("New dataset rows: {0}, columns: {1}".format(rows, cols))
Original dataset rows: 43978, columns: 31
New dataset rows: 42892, columns: 31
# Departure delays longer than this many minutes count as "delayed".
minutesOfDelayWeTolerate = 15

# Append the threshold comparison as a new trailing column. The frame had
# numCols columns before cbind, so the appended column sits at 0-based index
# numCols; index numCols-1 is the pre-existing last column (IsDepDelayed).
noDepDelayedNAs = noDepDelayedNAs.cbind(noDepDelayedNAs["DepDelay"] > minutesOfDelayWeTolerate)

# Convert the appended column itself to a factor. Using numCols-1 here would
# overwrite the synthetic label with IsDepDelayed and ignore the threshold.
# NOTE: the sample output captured after this cell was produced with the old
# numCols-1 index, so it mirrors IsDepDelayed; re-run to refresh it.
noDepDelayedNAs[numCols] = noDepDelayedNAs[numCols].asfactor()
noDepDelayedNAs.set_name(numCols, y_col)
Year | Month | DayofMonth | DayOfWeek | DepTime | CRSDepTime | ArrTime | CRSArrTime | UniqueCarrier | FlightNum | TailNum | ActualElapsedTime | CRSElapsedTime | AirTime | ArrDelay | DepDelay | Origin | Dest | Distance | TaxiIn | TaxiOut | Cancelled | CancellationCode | Diverted | CarrierDelay | WeatherDelay | NASDelay | SecurityDelay | LateAircraftDelay | IsArrDelayed | IsDepDelayed | SynthDepDelayed |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1987 | 10 | 14 | 3 | 741 | 730 | 912 | 849 | 5 | 1451 | 3499 | 91 | 79 | nan | 23 | 11 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 1 | 1 |
1987 | 10 | 15 | 4 | 729 | 730 | 903 | 849 | 5 | 1451 | 3499 | 94 | 79 | nan | 14 | -1 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 0 | 0 |
1987 | 10 | 17 | 6 | 741 | 730 | 918 | 849 | 5 | 1451 | 3499 | 97 | 79 | nan | 29 | 11 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 1 | 1 |
1987 | 10 | 18 | 7 | 729 | 730 | 847 | 849 | 5 | 1451 | 3499 | 78 | 79 | nan | -2 | -1 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 0 | 0 | 0 |
1987 | 10 | 19 | 1 | 749 | 730 | 922 | 849 | 5 | 1451 | 3499 | 93 | 79 | nan | 33 | 19 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 1 | 1 |
1987 | 10 | 21 | 3 | 728 | 730 | 848 | 849 | 5 | 1451 | 3499 | 80 | 79 | nan | -1 | -2 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 0 | 0 | 0 |
1987 | 10 | 22 | 4 | 728 | 730 | 852 | 849 | 5 | 1451 | 3499 | 84 | 79 | nan | 3 | -2 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 0 | 0 |
1987 | 10 | 23 | 5 | 731 | 730 | 902 | 849 | 5 | 1451 | 3499 | 91 | 79 | nan | 13 | 1 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 1 | 1 |
1987 | 10 | 24 | 6 | 744 | 730 | 908 | 849 | 5 | 1451 | 3499 | 84 | 79 | nan | 19 | 14 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 1 | 1 |
1987 | 10 | 25 | 7 | 729 | 730 | 851 | 849 | 5 | 1451 | 3499 | 82 | 79 | nan | 2 | -1 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 0 | 0 |
# Fit a GBM with a bernoulli distribution on the conditioned frame, using the
# selected predictors and the synthetic response, then print the model report.
gbm = H2OGradientBoostingEstimator(distribution="bernoulli")
gbm.train(
    x=x_cols,
    y=y_col,
    training_frame=noDepDelayedNAs,
)
gbm.show()
gbm Model Build Progress: [##################################################] 100%
Model Details
=============
H2OGradientBoostingEstimator : Gradient Boosting Machine
Model Key: GBM_model_python_1445841486633_37
Model Summary:
number_of_trees | model_size_in_bytes | min_depth | max_depth | mean_depth | min_leaves | max_leaves | mean_leaves | |
50.0 | 34338.0 | 5.0 | 5.0 | 5.0 | 18.0 | 32.0 | 28.62 |
ModelMetricsBinomial: gbm
** Reported on train data. **
MSE: 0.191672191035
R^2: 0.232789986813
LogLoss: 0.565710073073
AUC: 0.785428554449
Gini: 0.570857108897
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.412557029006:
NO | YES | Error | Rate | |
NO | 11180.0 | 9707.0 | 0.4647 | (9707.0/20887.0) |
YES | 3402.0 | 18603.0 | 0.1546 | (3402.0/22005.0) |
Total | 14582.0 | 28310.0 | 0.3056 | (13109.0/42892.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
metric | threshold | value | idx |
max f1 | 0.4 | 0.7 | 259.0 |
max f2 | 0.2 | 0.8 | 347.0 |
max f0point5 | 0.6 | 0.7 | 180.0 |
max accuracy | 0.5 | 0.7 | 213.0 |
max precision | 1.0 | 1.0 | 0.0 |
max absolute_MCC | 0.5 | 0.4 | 213.0 |
max min_per_class_accuracy | 0.5 | 0.7 | 209.0 |
Scoring History:
timestamp | duration | number_of_trees | training_MSE | training_logloss | training_AUC | training_classification_error | |
2015-10-25 23:57:36 | 0.032 sec | 1.0 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-25 23:57:36 | 0.059 sec | 2.0 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-25 23:57:36 | 0.089 sec | 3.0 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-25 23:57:36 | 0.117 sec | 4.0 | 0.2 | 0.7 | 0.7 | 0.4 | |
2015-10-25 23:57:36 | 0.152 sec | 5.0 | 0.2 | 0.7 | 0.7 | 0.4 | |
--- | --- | --- | --- | --- | --- | --- | --- |
2015-10-25 23:57:38 | 2.267 sec | 46.0 | 0.2 | 0.6 | 0.8 | 0.3 | |
2015-10-25 23:57:38 | 2.323 sec | 47.0 | 0.2 | 0.6 | 0.8 | 0.3 | |
2015-10-25 23:57:38 | 2.378 sec | 48.0 | 0.2 | 0.6 | 0.8 | 0.3 | |
2015-10-25 23:57:38 | 2.435 sec | 49.0 | 0.2 | 0.6 | 0.8 | 0.3 | |
2015-10-25 23:57:38 | 2.493 sec | 50.0 | 0.2 | 0.6 | 0.8 | 0.3 |
Variable Importances:
variable | relative_importance | scaled_importance | percentage |
Origin | 6877.3 | 1.0 | 0.4 |
Dest | 4551.0 | 0.7 | 0.3 |
DayofMonth | 2025.6 | 0.3 | 0.1 |
UniqueCarrier | 1279.5 | 0.2 | 0.1 |
CRSArrTime | 724.8 | 0.1 | 0.0 |
CRSDepTime | 636.9 | 0.1 | 0.0 |
DayOfWeek | 408.2 | 0.1 | 0.0 |
CRSElapsedTime | 118.8 | 0.0 | 0.0 |
Month | 73.3 | 0.0 | 0.0 |
Distance | 31.1 | 0.0 | 0.0 |