import h2o
import pandas
import pprint
import operator
import matplotlib
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from tabulate import tabulate
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/__init__.py:7: DeprecationWarning: bad escape \s from pandas import hashtable, tslib, lib
# Connect to a cluster
# (on success h2o.init() prints the cluster status table: version, nodes,
# memory, cores, connection ip/port)
h2o.init()
H2O cluster uptime: | 11 seconds 120 milliseconds |
H2O cluster version: | 3.7.0.99999 |
H2O cluster name: | spIdea |
H2O cluster total nodes: | 1 |
H2O cluster total free memory: | 12.44 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
H2O Connection proxy: | None |
Python Version: | 3.5.0 |
# set this to True if interactive (matplotlib) plots are desired
interactive = False
# Select the non-GUI Agg backend *before* pyplot is imported so the demo can
# run without a display. The old `warn=False` keyword is intentionally not
# passed: it was deprecated in Matplotlib 3.1 and later removed, so passing
# it raises TypeError on current releases.
if not interactive:
    matplotlib.use('Agg')
import matplotlib.pyplot as plt
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
# Alternative (larger) data sets; swap in by uncommenting one of these.
# air_path = [_locate("bigdata/laptop/airlines_all.05p.csv")]
# air_path = [_locate("bigdata/laptop/flights-nyc/flights14.csv.zip")]
air_path = [_locate("smalldata/airlines/allyears2k_headers.zip")]
# ----------
# 1- Load data - 1 row per flight. Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
print("Import and Parse airlines data")
# air_path is a one-element list of file paths; the parsed frame is bound to
# `data` and reused by every later step of the demo.
data = h2o.import_file(path=air_path)
# Prints the chunk/distribution summaries, per-column type and statistics
# (mins, mean, maxs, sigma, zeros, missing) and the first rows.
data.describe()
Import and Parse airlines data Parse Progress: [##################################################] 100% Rows:43,978 Cols:31 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 10 | 5.376344 | 800 B | 0.0504024 |
C0D | Constant Reals | 23 | 12.365591 | 1.8 KB | 0.1159254 |
CBS | Bits | 2 | 1.0752689 | 2.0 KB | 0.1272030 |
CX0 | Sparse Bits | 10 | 5.376344 | 1.9 KB | 0.1247459 |
C1 | 1-Byte Integers | 40 | 21.505377 | 287.8 KB | 18.564957 |
C1N | 1-Byte Integers (w/o NAs) | 19 | 10.215054 | 133.1 KB | 8.58617 |
C1S | 1-Byte Fractions | 6 | 3.2258065 | 43.4 KB | 2.8024976 |
C2 | 2-Byte Integers | 76 | 40.860214 | 1.1 MB | 69.628105 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.84:54321 | 1.5 MB | 43978.0 | 6.0 | 186.0 |
mean | 1.5 MB | 43978.0 | 6.0 | 186.0 |
min | 1.5 MB | 43978.0 | 6.0 | 186.0 |
max | 1.5 MB | 43978.0 | 6.0 | 186.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 1.5 MB | 43978.0 | 6.0 | 186.0 |
Year | Month | DayofMonth | DayOfWeek | DepTime | CRSDepTime | ArrTime | CRSArrTime | UniqueCarrier | FlightNum | TailNum | ActualElapsedTime | CRSElapsedTime | AirTime | ArrDelay | DepDelay | Origin | Dest | Distance | TaxiIn | TaxiOut | Cancelled | CancellationCode | Diverted | CarrierDelay | WeatherDelay | NASDelay | SecurityDelay | LateAircraftDelay | IsArrDelayed | IsDepDelayed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | int | int | int | int | enum | int | enum | int | int | int | int | int | enum | enum | int | int | int | int | enum | int | int | int | int | int | int | enum | enum |
mins | 1987.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 16.0 | 17.0 | 14.0 | -63.0 | -16.0 | 0.0 | 0.0 | 11.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
mean | 1997.5 | 1.409090909090909 | 14.601073263904679 | 3.820614852880991 | 1345.8466613820763 | 1313.2228614307164 | 1504.6341303788884 | 1485.289167310927 | NaN | 818.8429896766577 | NaN | 124.8145291354043 | 125.02156260661899 | 114.31611109078277 | 9.317111936984313 | 10.0073906556001 | NaN | NaN | 730.1821905650501 | 5.381368059530628 | 14.168634184732056 | 0.024694165264450407 | NaN | 0.0024785119832643593 | 4.047800291055627 | 0.2893764692712417 | 4.855031904175534 | 0.017015560282100096 | 7.620060450016789 | 0.555755150302424 | 0.5250579835372226 |
maxs | 2008.0 | 10.0 | 31.0 | 7.0 | 2400.0 | 2359.0 | 2400.0 | 2359.0 | 9.0 | 3949.0 | 3500.0 | 475.0 | 437.0 | 402.0 | 475.0 | 473.0 | 131.0 | 133.0 | 3365.0 | 128.0 | 254.0 | 1.0 | 3.0 | 1.0 | 369.0 | 201.0 | 323.0 | 14.0 | 373.0 | 1.0 | 1.0 |
sigma | 6.344360901711177 | 1.874711371343963 | 9.175790425861443 | 1.9050131191328936 | 465.340899124234 | 476.25113999259946 | 484.34748790351614 | 492.75043412270094 | NaN | 777.4043691636349 | NaN | 73.97444166059017 | 73.4015946300093 | 69.63632951506109 | 29.840221962414848 | 26.438809042916454 | NaN | NaN | 578.438008230424 | 4.201979939864828 | 9.905085747204327 | 0.15519314135784237 | NaN | 0.049723487218862286 | 16.20572990448423 | 4.416779898734124 | 18.619776221475682 | 0.40394018210151184 | 23.487565874106213 | 0.4968872883428837 | 0.49937738031758017 |
zeros | 0 | 0 | 0 | 0 | 0 | 569 | 0 | 569 | 724 | 0 | 2 | 0 | 0 | -8878 | 1514 | 6393 | 59 | 172 | 0 | -8255 | -8321 | 42892 | 81 | 43869 | -23296 | -21800 | -23252 | -21726 | -23500 | 19537 | 20887 |
missing | 0 | 0 | 0 | 0 | 1086 | 0 | 1195 | 0 | 0 | 0 | 32 | 1195 | 13 | 16649 | 1195 | 1086 | 0 | 0 | 35 | 16026 | 16024 | 0 | 9774 | 0 | 35045 | 35045 | 35045 | 35045 | 35045 | 0 | 0 |
0 | 1987.0 | 10.0 | 14.0 | 3.0 | 741.0 | 730.0 | 912.0 | 849.0 | PS | 1451.0 | NA | 91.0 | 79.0 | nan | 23.0 | 11.0 | SAN | SFO | 447.0 | nan | nan | 0.0 | NA | 0.0 | nan | nan | nan | nan | nan | YES | YES |
1 | 1987.0 | 10.0 | 15.0 | 4.0 | 729.0 | 730.0 | 903.0 | 849.0 | PS | 1451.0 | NA | 94.0 | 79.0 | nan | 14.0 | -1.0 | SAN | SFO | 447.0 | nan | nan | 0.0 | NA | 0.0 | nan | nan | nan | nan | nan | YES | NO |
2 | 1987.0 | 10.0 | 17.0 | 6.0 | 741.0 | 730.0 | 918.0 | 849.0 | PS | 1451.0 | NA | 97.0 | 79.0 | nan | 29.0 | 11.0 | SAN | SFO | 447.0 | nan | nan | 0.0 | NA | 0.0 | nan | nan | nan | nan | nan | YES | YES |
3 | 1987.0 | 10.0 | 18.0 | 7.0 | 729.0 | 730.0 | 847.0 | 849.0 | PS | 1451.0 | NA | 78.0 | 79.0 | nan | -2.0 | -1.0 | SAN | SFO | 447.0 | nan | nan | 0.0 | NA | 0.0 | nan | nan | nan | nan | nan | NO | NO |
4 | 1987.0 | 10.0 | 19.0 | 1.0 | 749.0 | 730.0 | 922.0 | 849.0 | PS | 1451.0 | NA | 93.0 | 79.0 | nan | 33.0 | 19.0 | SAN | SFO | 447.0 | nan | nan | 0.0 | NA | 0.0 | nan | nan | nan | nan | nan | YES | YES |
5 | 1987.0 | 10.0 | 21.0 | 3.0 | 728.0 | 730.0 | 848.0 | 849.0 | PS | 1451.0 | NA | 80.0 | 79.0 | nan | -1.0 | -2.0 | SAN | SFO | 447.0 | nan | nan | 0.0 | NA | 0.0 | nan | nan | nan | nan | nan | NO | NO |
6 | 1987.0 | 10.0 | 22.0 | 4.0 | 728.0 | 730.0 | 852.0 | 849.0 | PS | 1451.0 | NA | 84.0 | 79.0 | nan | 3.0 | -2.0 | SAN | SFO | 447.0 | nan | nan | 0.0 | NA | 0.0 | nan | nan | nan | nan | nan | YES | NO |
7 | 1987.0 | 10.0 | 23.0 | 5.0 | 731.0 | 730.0 | 902.0 | 849.0 | PS | 1451.0 | NA | 91.0 | 79.0 | nan | 13.0 | 1.0 | SAN | SFO | 447.0 | nan | nan | 0.0 | NA | 0.0 | nan | nan | nan | nan | nan | YES | YES |
8 | 1987.0 | 10.0 | 24.0 | 6.0 | 744.0 | 730.0 | 908.0 | 849.0 | PS | 1451.0 | NA | 84.0 | 79.0 | nan | 19.0 | 14.0 | SAN | SFO | 447.0 | nan | nan | 0.0 | NA | 0.0 | nan | nan | nan | nan | nan | YES | YES |
9 | 1987.0 | 10.0 | 25.0 | 7.0 | 729.0 | 730.0 | 851.0 | 849.0 | PS | 1451.0 | NA | 82.0 | 79.0 | nan | 2.0 | -1.0 | SAN | SFO | 447.0 | nan | nan | 0.0 | NA | 0.0 | nan | nan | nan | nan | nan | YES | NO |
# ----------
# 2- Data exploration and munging. Generate scatter plots
# of various columns and plot fitted GLM model.
# Function to fit a GLM model and plot the fitted (x,y) values
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    """Plot column y against column x for a random sample of rows.

    Args:
        data: H2O frame holding both columns.
        x: Name of the column shown on the horizontal axis.
        y: Name of the column shown on the vertical axis.
        max_points: Target number of sampled rows; each row is kept when its
            uniform random draw is below max_points / data.nrow, so the
            actual sample size is only approximately this value.
        fit: When True, train a gaussian GLM of y on x and overlay the fitted
            straight line (numeric x only).

    Plots are only drawn/shown for the scatter and box plot when the
    module-level ``interactive`` flag is set.
    """
    if fit:
        lr = H2OGeneralizedLinearEstimator(family = "gaussian")
        lr.train(x=x, y=y, training_frame=data)
        coeff = lr.coef()
        # The model is only needed for its coefficients; drop it from the
        # cluster right away so it is not leaked if a later step fails.
        h2o.remove(lr._id)

    # Pull a random subset of the two columns into a local pandas frame
    df = data[[x,y]]
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow]
    df_py = h2o.as_list(df_subset)

    # If x variable is string, generate box-and-whisker plot
    x_is_categorical = df_py[x].dtype == "object"
    if x_is_categorical:
        if interactive: df_py.boxplot(column = y, by = x)
    # Otherwise, generate a scatter plot
    else:
        if interactive: df_py.plot(x = x, y = y, kind = "scatter")

    # A single slope only exists for a numeric predictor: for a categorical x
    # the GLM coefficients are keyed per level (e.g. "Origin.MDW"), so
    # coeff[x] would raise KeyError.
    if fit and not x_is_categorical and x in coeff:
        # Series.min()/max() skip NaN; the builtin min()/max() give
        # order-dependent results when the sample contains missing values
        # and raise on an empty sample.
        x_min = df_py[x].min()
        x_max = df_py[x].max()
        y_min = coeff["Intercept"] + coeff[x]*x_min
        y_max = coeff["Intercept"] + coeff[x]*x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    if interactive: plt.show()
# Numeric predictor: AirTime vs. Distance with the fitted GLM line overlaid
scatter_plot(data, "Distance", "AirTime", fit = True)
# Categorical (enum) predictor: takes the box-and-whisker branch; larger
# sample, no GLM fit
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)
glm Model Build Progress: [##################################################] 100%
# Group flights by month
# (per-month row count plus the total of the Cancelled flag; this runs
# before Cancelled is turned into a factor further down)
grouped = data.group_by("Month")
bpd = grouped.count().sum("Cancelled").frame
bpd.show()
bpd.describe()
bpd.dim

# Convert columns to factors
for factor_col in ("Year", "Month", "DayOfWeek", "Cancelled"):
    data[factor_col] = data[factor_col].asfactor()
Month | sum_Cancelled | nrow_Year |
---|---|---|
1 | 1067 | 41979 |
10 | 19 | 1999 |
Rows:2 Cols:3 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C1N | 1-Byte Integers (w/o NAs) | 1 | 33.333336 | 70 B | 30.434782 |
C2 | 2-Byte Integers | 1 | 33.333336 | 72 B | 31.304348 |
C2S | 2-Byte Fractions | 1 | 33.333336 | 88 B | 38.260868 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.84:54321 | 230 B | 2.0 | 1.0 | 3.0 |
mean | 230 B | 2.0 | 1.0 | 3.0 |
min | 230 B | 2.0 | 1.0 | 3.0 |
max | 230 B | 2.0 | 1.0 | 3.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 230 B | 2.0 | 1.0 | 3.0 |
Month | sum_Cancelled | nrow_Year | |
---|---|---|---|
type | int | int | int |
mins | 1.0 | 19.0 | 1999.0 |
mean | 5.5 | 543.0 | 21989.0 |
maxs | 10.0 | 1067.0 | 41979.0 |
sigma | 6.363961030678928 | 741.0479066835018 | 28270.12911183817 |
zeros | 0 | 0 | 0 |
missing | 0 | 0 | 0 |
0 | 1.0 | 1067.0 | 41979.0 |
1 | 10.0 | 19.0 | 1999.0 |
# Calculate and plot travel time
# The scheduled times are treated as HHMM integers (e.g. 730 -> 7 h 30 min).
# The hour part must be a whole number: plain `/ 100` is true division and
# would turn 730 into 7.30 "hours" (468 minutes instead of 450), so the
# minutes are subtracted first to make the division exact.
mins1 = data["CRSArrTime"] % 100
hour1 = (data["CRSArrTime"] - mins1) / 100
arrTime = hour1*60 + mins1
mins2 = data["CRSDepTime"] % 100
hour2 = (data["CRSDepTime"] - mins2) / 100
depTime = hour2*60 + mins2
# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
# Non-positive differences are replaced by missing values via a frame of
# data.nrow rows that each hold None.
travel = arrTime - depTime
data["TravelTime"] = (travel > 0).ifelse(travel, h2o.H2OFrame([[None]] * data.nrow))
scatter_plot(data, "Distance", "TravelTime")
Parse Progress: [##################################################] 100% glm Model Build Progress: [##################################################] 100%
# Impute missing Distance values, grouped by (Origin, Dest), and re-plot.
# NOTE(review): the column imputed here is "Distance", not "TravelTime";
# the missing TravelTime values are left as they are -- confirm that this
# is the intended column.
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")
glm Model Build Progress: [##################################################] 100%
# ----------
# 3- Fit a model on train; using test as validation
# Create test/train split
# Random 75/25 split: one uniform draw per row decides train vs. test
s = data["Year"].runif()
train = data[s <= 0.75]
test = data[s > 0.75]

# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]

# Every model below is trained on the same columns and the same frames,
# with the test split used as the validation frame.
fit_args = dict(x=myX, y=myY, training_frame=train, validation_frame=test)

# Simple GLM - Predict Delays
data_glm = H2OGeneralizedLinearEstimator(family="binomial", standardize=True)
data_glm.train(**fit_args)

# Settings shared by both GBM variants; only tree count and depth differ
gbm_common = dict(balance_classes=True,
                  distribution="bernoulli",
                  learn_rate=0.1,
                  min_rows=2)

# Simple GBM
data_gbm = H2OGradientBoostingEstimator(ntrees=3, max_depth=1, **gbm_common)
data_gbm.train(**fit_args)

# Complex GBM
data_gbm2 = H2OGradientBoostingEstimator(ntrees=50, max_depth=5, **gbm_common)
data_gbm2.train(**fit_args)

# Simple Random Forest
data_rf = H2ORandomForestEstimator(ntrees=5, max_depth=2, balance_classes=True)
data_rf.train(**fit_args)

# Complex Random Forest
data_rf2 = H2ORandomForestEstimator(ntrees=10, max_depth=5, balance_classes=True)
data_rf2.train(**fit_args)

# Deep Learning with 5 epochs
data_dl = H2ODeepLearningEstimator(hidden=[10,10],
                                   epochs=5,
                                   variable_importances=True,
                                   balance_classes=True,
                                   loss="Automatic")
data_dl.train(**fit_args)
glm Model Build Progress: [##################################################] 100% gbm Model Build Progress: [##################################################] 100% gbm Model Build Progress: [##################################################] 100% drf Model Build Progress: [##################################################] 100% drf Model Build Progress: [##################################################] 100% deeplearning Model Build Progress: [##################################################] 100%
# Variable importances from each algorithm
from six import iteritems

# For the GLM, rank predictors by the magnitude of their normalized
# coefficients: replace every value by its absolute value, in place.
glm_varimp = data_glm.coef_norm()
glm_varimp.update([(name, abs(coef)) for name, coef in iteritems(glm_varimp)])

# Largest magnitude first
glm_sorted = sorted(iteritems(glm_varimp), key=operator.itemgetter(1), reverse=True)
table = tabulate(glm_sorted,
                 headers=["Predictor", "Normalized Coefficient"],
                 tablefmt="orgtbl")
print("Variable Importances:\n\n" + table)

# Tree models expose their own variable-importance tables
data_gbm.varimp()
data_rf.varimp()
Variable Importances: | Predictor | Normalized Coefficient | |------------------+--------------------------| | Year.2008 | 2.1663 | | Dest.HTS | 1.59911 | | Year.2003 | 1.59565 | | Origin.MDW | 1.58362 | | Year.2007 | 1.37479 | | Origin.HPN | 1.34354 | | Origin.LIH | 1.32598 | | Dest.LYH | 1.29275 | | Origin.LBB | 1.21984 | | Origin.LEX | 1.21291 | | Origin.ERI | 1.20959 | | Origin.TLH | 1.17343 | | Origin.CAE | 1.15044 | | UniqueCarrier.HP | 1.12944 | | Origin.PSP | 1.11685 | | Origin.HNL | 1.11194 | | Origin.TRI | 1.02187 | | UniqueCarrier.TW | 1.0169 | | Year.2001 | 0.979973 | | Year.2002 | 0.944374 | | Origin.SDF | 0.939753 | | Origin.ATL | 0.935832 | | Origin.GRR | 0.884671 | | Origin.PBI | 0.882257 | | Origin.CHO | 0.878584 | | Origin.OGG | 0.864754 | | Origin.SRQ | 0.856535 | | Year.2004 | 0.846669 | | Origin.MYR | 0.835173 | | Origin.ACY | 0.804102 | | Origin.ORD | 0.787865 | | Year.1994 | 0.781128 | | Origin.MAF | 0.766548 | | Origin.TUL | 0.765077 | | Origin.MRY | 0.759124 | | Year.2006 | 0.749834 | | Origin.STL | 0.737706 | | Origin.LYH | 0.728328 | | Dest.CHO | 0.728328 | | Origin.CMH | 0.703809 | | Dest.GSO | 0.694797 | | Origin.BTV | 0.678703 | | Origin.ROA | 0.672739 | | Dest.ISP | 0.666122 | | Dest.LIH | 0.647256 | | Origin.AUS | 0.646233 | | Origin.IAH | 0.637049 | | Dest.FLL | 0.624057 | | Origin.MLB | 0.611271 | | Dest.PBI | 0.609092 | | Origin.PIT | 0.604604 | | Origin.PWM | 0.603332 | | Dest.ICT | 0.601697 | | Year.1996 | 0.601507 | | Origin.TYS | 0.590041 | | Origin.MSY | 0.587653 | | Year.1990 | 0.564752 | | Dest.DAY | 0.564026 | | Origin.SYR | 0.560879 | | Dest.IAH | 0.553572 | | Dest.EUG | 0.54793 | | Origin.JAX | 0.542031 | | Origin.BOI | 0.541044 | | Dest.TOL | 0.528751 | | Dest.TPA | 0.51248 | | Dest.BUF | 0.512192 | | Dest.PSP | 0.508527 | | Origin.ALB | 0.506946 | | Origin.SAV | 0.50483 | | Origin.CRW | 0.504431 | | Dest.PNS | 0.503218 | | UniqueCarrier.CO | 0.499991 | | Dest.SFO | 0.499403 | | Origin.PHL | 0.498516 | | Year.1997 | 
0.492557 | | Origin.OKC | 0.491762 | | Origin.LGA | 0.488253 | | Origin.MIA | 0.480325 | | Origin.OMA | 0.477082 | | Dest.CHS | 0.475901 | | Dest.CAK | 0.473522 | | Origin.FLL | 0.469294 | | Origin.ICT | 0.464117 | | Dest.GEG | 0.461246 | | Origin.EGE | 0.461207 | | Dest.ABQ | 0.461191 | | Dest.EYW | 0.452089 | | Year.2005 | 0.45045 | | Dest.IND | 0.449927 | | UniqueCarrier.WN | 0.446792 | | Origin.IND | 0.446311 | | Origin.GSO | 0.442529 | | Origin.MCO | 0.434966 | | Origin.LAX | 0.433672 | | Origin.BDL | 0.418545 | | Dest.CAE | 0.414453 | | Dest.SMF | 0.409427 | | Origin.CRP | 0.403216 | | Origin.DFW | 0.399445 | | Dest.BDL | 0.395146 | | Dest.CVG | 0.391672 | | Dest.UCA | 0.39075 | | Origin.DSM | 0.387103 | | Origin.MEM | 0.383554 | | Origin.EYW | 0.375727 | | Dest.CLE | 0.372843 | | Dest.FAT | 0.369287 | | UniqueCarrier.PI | 0.366404 | | Origin.SLC | 0.354344 | | Origin.JFK | 0.34159 | | Origin.BWI | 0.339737 | | Dest.MIA | 0.338326 | | Origin.ROC | 0.328992 | | Origin.OAK | 0.327167 | | Dest.BGM | 0.323214 | | Origin.IAD | 0.320497 | | Dest.JAX | 0.319508 | | Dest.MKE | 0.31828 | | Year.1992 | 0.31714 | | Dest.MCO | 0.315641 | | Dest.FAY | 0.315447 | | Dest.COS | 0.314929 | | Origin.RNO | 0.314859 | | Origin.MCI | 0.313843 | | Dest.SAT | 0.305571 | | Year.1995 | 0.29602 | | Origin.SAN | 0.292782 | | Dest.OGG | 0.281564 | | Year.1991 | 0.274708 | | Dest.BUR | 0.270584 | | Dest.ALB | 0.268558 | | Dest.TUL | 0.26762 | | Origin.DAY | 0.264843 | | Origin.BUR | 0.264689 | | Origin.CLT | 0.256984 | | Origin.ONT | 0.256321 | | Origin.MKE | 0.254529 | | Origin.HRL | 0.253809 | | DayOfWeek.5 | 0.244342 | | UniqueCarrier.US | 0.239344 | | Dest.BTV | 0.23824 | | Origin.ABE | 0.234584 | | Origin.TPA | 0.22891 | | Dest.STT | 0.225113 | | Origin.STX | 0.223986 | | Dest.GSP | 0.221914 | | Origin.BHM | 0.219408 | | Dest.IAD | 0.219399 | | Origin.BOS | 0.21936 | | Origin.MDT | 0.217089 | | Dest.PVD | 0.21636 | | Dest.RSW | 0.208373 | | Origin.ELP | 0.207048 | | Origin.DEN | 
0.205402 | | Dest.LIT | 0.204071 | | Month.10 | 0.203185 | | Year.1987 | 0.203185 | | Dest.BWI | 0.202309 | | Origin.MSP | 0.201702 | | Dest.PDX | 0.201547 | | Dest.ROC | 0.199012 | | Origin.TUS | 0.197624 | | Dest.KOA | 0.197388 | | Dest.CLT | 0.191233 | | Dest.OAJ | 0.188976 | | Year.1999 | 0.186221 | | Origin.SJC | 0.182876 | | Dest.DAL | 0.179589 | | Origin.BUF | 0.178246 | | DayOfWeek.2 | 0.17761 | | Origin.DAL | 0.175027 | | Origin.CLE | 0.173502 | | Dest.GRR | 0.169856 | | Dest.PWM | 0.16768 | | UniqueCarrier.AA | 0.167342 | | Year.1993 | 0.166087 | | Dest.RNO | 0.165744 | | Distance | 0.163211 | | Dest.LBB | 0.157175 | | Dest.HRL | 0.156284 | | Dest.ABE | 0.155532 | | Dest.CMH | 0.154857 | | Dest.CRP | 0.151555 | | Dest.SNA | 0.151435 | | Origin.SFO | 0.150441 | | Dest.SEA | 0.149936 | | Dest.ROA | 0.148303 | | Year.2000 | 0.146046 | | Dest.ORF | 0.134053 | | Dest.SAN | 0.133593 | | DayOfWeek.6 | 0.132748 | | Dest.MSP | 0.132271 | | Origin.COS | 0.128671 | | Dest.HOU | 0.127342 | | Dest.TUS | 0.120346 | | DayOfWeek.4 | 0.119748 | | Dest.DSM | 0.116603 | | Dest.LAX | 0.11609 | | Dest.SLC | 0.114966 | | Dest.AVP | 0.112227 | | Dest.STL | 0.110793 | | Origin.ORF | 0.108536 | | Dest.BHM | 0.108348 | | UniqueCarrier.UA | 0.107298 | | Origin.DTW | 0.105773 | | Dest.MDW | 0.10405 | | Dest.DFW | 0.0989164 | | Origin.CVG | 0.0967693 | | Origin.SMF | 0.0959796 | | Origin.RSW | 0.0934595 | | Origin.SWF | 0.0927228 | | Month.1 | 0.092347 | | Dest.PHL | 0.0848795 | | Dest.PHX | 0.0848389 | | Origin.RDU | 0.0839633 | | Origin.DCA | 0.0832363 | | Dest.OAK | 0.0818515 | | Dest.MCI | 0.0815358 | | Dest.EWR | 0.0785491 | | Dest.DEN | 0.0783454 | | Dest.DTW | 0.0774459 | | Year.1989 | 0.0762646 | | Dest.LAS | 0.0743316 | | Dest.MDT | 0.0731147 | | Dest.RIC | 0.0723303 | | Dest.OMA | 0.0661859 | | UniqueCarrier.PS | 0.0645156 | | Year.1998 | 0.05845 | | Dest.MHT | 0.0576363 | | Origin.BNA | 0.0553462 | | Origin.PHX | 0.0522407 | | Origin.GNV | 0.0504304 | | Dest.MSY | 
0.0501866 | | Origin.PVD | 0.0490418 | | Origin.MFR | 0.0437977 | | Origin.SNA | 0.0421396 | | FlightNum | 0.0376186 | | Origin.SEA | 0.0372322 | | Dest.BNA | 0.0347007 | | Origin.PHF | 0.029703 | | Dest.LGA | 0.0291171 | | Intercept | 0.026855 | | Dest.ORD | 0.0244753 | | DayOfWeek.7 | 0.0234737 | | Dest.SJC | 0.0177833 | | Dest.AVL | 0.0172911 | | Dest.BOS | 0.0162872 | | DayOfWeek.1 | 0.0153713 | | Origin.PDX | 0.0112833 | | Origin.RIC | 0.011192 | | Origin.SAT | 0.0110852 | | Year.1988 | 0.00996483 | | Origin.BGM | 0.00952641 | | Dest.PIT | 0.00935131 | | Dest.ATL | 0.00882664 | | Origin.CHS | 0.00818887 | | Origin.ABQ | 0.00803383 | | Dest.ILM | 0.00255637 | | UniqueCarrier.DL | 0.00110988 | | Origin.GEG | 0 | | Origin.SBN | 0 | | Origin.STT | 0 | | Origin.ANC | 0 | | Dest.AMA | 0 | | Dest.RDU | 0 | | Dest.FNT | 0 | | Dest.LEX | 0 | | Origin.HOU | 0 | | Origin.LAS | 0 | | Dest.ACY | 0 | | Dest.AUS | 0 | | Dest.SDF | 0 | | Dest.DCA | 0 | | Dest.MRY | 0 | | Dest.SCK | 0 | | Origin.EWR | 0 | | Dest.PHF | 0 | | Dest.BOI | 0 | | Origin.AVP | 0 | | Origin.LAN | 0 | | Dest.SBN | 0 | | Dest.JFK | 0 | | Dest.SJU | 0 | | Origin.UCA | 0 | | DayOfWeek.3 | 0 | | Dest.SYR | 0 | | Origin.KOA | 0 | | Origin.MHT | 0 | | Origin.LIT | 0 | | Dest.JAN | 0 | | Origin.SCK | 0 | | Dest.ERI | 0 | | Dest.ELM | 0 | | Dest.HNL | 0 | | Dest.OKC | 0 | | Dest.HPN | 0 | | Origin.BIL | 0 | | Dest.ORH | 0 | | Dest.MYR | 0 | | Dest.SRQ | 0 | | Dest.ANC | 0 | | Dest.CHA | 0 | | Dest.SWF | 0 | | Origin.JAN | 0 | | Origin.AMA | 0 | | Dest.ONT | 0 | | Dest.ELP | 0 | | Origin.ISP | 0 | | Dest.MAF | 0 | | Origin.SJU | 0 |
[('Year', 860.6602783203125, 1.0, 0.5018886676744018), ('Dest', 593.151123046875, 0.6891814784394192, 0.3458923739998345), ('UniqueCarrier', 87.23373413085938, 0.1013567563511901, 0.05086980740489776), ('DayOfWeek', 80.93794250488281, 0.09404168467358974, 0.04719845582668416), ('Distance', 65.31503295898438, 0.07588944744429815, 0.03808805366836533), ('FlightNum', 27.54490852355957, 0.032004391532181486, 0.01606264142581647), ('Month', 0.0, 0.0, 0.0), ('Origin', 0.0, 0.0, 0.0)]
# Model performance of GBM model on test data
# (data_gbm2 is the 50-tree, depth-5 GBM; test is the held-out 25% split)
data_gbm2.model_performance(test)
ModelMetricsBinomial: gbm ** Reported on test data. ** MSE: 0.20407778554922562 R^2: 0.18116065189707653 LogLoss: 0.5945117554029998 AUC: 0.7467255149856272 Gini: 0.49345102997125445 Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3514986726263641:
NO | YES | Error | Rate | |
NO | 1968.0 | 3199.0 | 0.6191 | (3199.0/5167.0) |
YES | 657.0 | 5118.0 | 0.1138 | (657.0/5775.0) |
Total | 2625.0 | 8317.0 | 0.3524 | (3856.0/10942.0) |
Maximum Metrics: Maximum metrics at their respective thresholds
metric | threshold | value | idx |
max f1 | 0.3514987 | 0.7263696 | 287.0 |
max f2 | 0.1882069 | 0.8505254 | 372.0 |
max f0point5 | 0.5203289 | 0.7060683 | 199.0 |
max accuracy | 0.4815086 | 0.6868945 | 220.0 |
max precision | 0.9607084 | 1.0 | 0.0 |
max absolute_MCC | 0.5011300 | 0.3721374 | 209.0 |
max min_per_class_accuracy | 0.5067588 | 0.6851171 | 206.0 |
Gains/Lift Table: Avg response rate: 52.78 %
group | lower_threshold | cumulative_data_fraction | response_rate | cumulative_response_rate | capture_rate | cumulative_capture_rate | lift | cumulative_lift | gain | cumulative_gain | |
1 | 0.8528015 | 0.0500823 | 0.8978102 | 0.8978102 | 0.0851948 | 0.0851948 | 1.7010977 | 1.7010977 | 70.1097734 | 70.1097734 | |
2 | 0.7962821 | 0.1001645 | 0.8321168 | 0.8649635 | 0.0789610 | 0.1641558 | 1.5766272 | 1.6388625 | 57.6627168 | 63.8862451 | |
3 | 0.7494101 | 0.1500640 | 0.7893773 | 0.8398295 | 0.0746320 | 0.2387879 | 1.4956478 | 1.5912405 | 49.5647844 | 59.1240542 | |
4 | 0.7173250 | 0.2000548 | 0.7787934 | 0.8245774 | 0.0737662 | 0.3125541 | 1.4755944 | 1.5623422 | 47.5594387 | 56.2342211 | |
5 | 0.6855407 | 0.2501371 | 0.7408759 | 0.8078188 | 0.0703030 | 0.3828571 | 1.4037514 | 1.5305893 | 40.3751382 | 53.0589279 | |
6 | 0.6556998 | 0.3002193 | 0.7043796 | 0.7905632 | 0.0668398 | 0.4496970 | 1.3346011 | 1.4978947 | 33.4601068 | 49.7894747 | |
7 | 0.6236469 | 0.3500274 | 0.6495413 | 0.7704961 | 0.0612987 | 0.5109957 | 1.2306980 | 1.4598733 | 23.0697963 | 45.9873272 | |
8 | 0.5865343 | 0.4001097 | 0.5985401 | 0.7489721 | 0.0567965 | 0.5677922 | 1.1340652 | 1.4190914 | 13.4065156 | 41.9091443 | |
9 | 0.5478266 | 0.4500091 | 0.5787546 | 0.7300975 | 0.0547186 | 0.6225108 | 1.0965771 | 1.3833293 | 9.6577074 | 38.3329289 | |
10 | 0.5128311 | 0.5 | 0.5557587 | 0.7126668 | 0.0526407 | 0.6751515 | 1.0530063 | 1.3503030 | 5.3006323 | 35.0303030 | |
11 | 0.4815168 | 0.5508134 | 0.5161871 | 0.6945412 | 0.0496970 | 0.7248485 | 0.9780292 | 1.3159602 | -2.1970787 | 31.5960199 | |
12 | 0.4483592 | 0.6001645 | 0.45 | 0.6744328 | 0.0420779 | 0.7669264 | 0.8526234 | 1.2778603 | -14.7376623 | 27.7860324 | |
13 | 0.4159386 | 0.6501554 | 0.4223035 | 0.6550464 | 0.04 | 0.8069264 | 0.8001463 | 1.2411286 | -19.9853748 | 24.1128584 | |
14 | 0.3884948 | 0.7000548 | 0.3736264 | 0.6349869 | 0.0353247 | 0.8422511 | 0.7079168 | 1.2031216 | -29.2083155 | 20.3121585 | |
15 | 0.3570956 | 0.7499543 | 0.3864469 | 0.6184499 | 0.0365368 | 0.8787879 | 0.7322081 | 1.1717886 | -26.7791891 | 17.1788566 | |
16 | 0.3277598 | 0.8000366 | 0.3485401 | 0.6015536 | 0.0330736 | 0.9118615 | 0.6603855 | 1.1397748 | -33.9614497 | 13.9774757 | |
17 | 0.2981756 | 0.8500274 | 0.2888483 | 0.5831631 | 0.0273593 | 0.9392208 | 0.5472862 | 1.1049300 | -45.2713819 | 10.4929982 | |
18 | 0.2699267 | 0.8999269 | 0.2637363 | 0.5654514 | 0.0249351 | 0.9641558 | 0.4997060 | 1.0713713 | -50.0293992 | 7.1371306 | |
19 | 0.2239691 | 0.9499177 | 0.2230347 | 0.5474312 | 0.0211255 | 0.9852814 | 0.4225881 | 1.0372281 | -57.7411936 | 3.7228104 | |
20 | 0.0694869 | 1.0 | 0.1551095 | 0.5277829 | 0.0147186 | 1.0 | 0.2938888 | 1.0 | -70.6111164 | 0.0 |