import h2o
import time
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
# Explore a typical Data Science workflow with H2O and Python
#
# Goal: help the manager of NYC's CitiBike system load-balance the bicycles
# across the network of stations, by predicting the number of bike trips
# taken from each station each day. Use 10 million rows of historical data,
# and eventually add weather data.
# Connect to a cluster
h2o.init()
Checking whether there is an H2O instance running at http://localhost:54321. connected.
H2O cluster uptime: | 08 secs |
H2O cluster version: | 3.11.0.99999 |
H2O cluster version age: | 1 minute |
H2O cluster name: | pasha |
H2O cluster total nodes: | 1 |
H2O cluster free memory: | 3.556 Gb |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster status: | accepting new members, healthy |
H2O connection url: | http://localhost:54321 |
H2O connection proxy: | None |
Python version: | 3.5.2 final |
from h2o.utils.shared_utils import _locate  # private function, used to find files within the h2o git project directory
# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = False
def mylocate(s):
    if data_source_is_s3:
        return "s3n://h2o-public-test-data/" + s
    else:
        return _locate(s)
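# For example (a hypothetical illustration, not run here): with
# data_source_is_s3 = True,
#   mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")
# returns "s3n://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-10.csv";
# otherwise _locate() resolves the same relative path inside the h2o git checkout.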
# Pick either the big or the small demo.
# The big dataset totals about 10M rows.
small_test = [mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")]
big_test = [mylocate("bigdata/laptop/citibike-nyc/2013-07.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2013-08.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2013-09.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2013-10.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2013-11.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2013-12.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-01.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-02.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-03.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-04.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-05.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-06.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-07.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-08.csv")]
# ----------
# 1 - Load data: one row per bicycle trip, with columns for the start and end
# station, the trip duration, and the trip start time and day. The larger
# dataset totals about 10 million rows.
print("Import and Parse bike data")
data = h2o.import_file(path=small_test)
Import and Parse bike data
Parse progress: |█████████████████████████████████████████████████████████| 100%
# ----------
# 2 - Light data munging: group the bike starts per day, converting the 10M
# rows of trips into about 140,000 station/day combinations - the number of
# trip starts per station per day is what we want to predict.
# Convert start time to days since the epoch
start_time = data["starttime"]
msecsPerDay = 1000 * 3600 * 24  # H2O time columns are milliseconds since the epoch
data["Days"] = (start_time.asnumeric() / msecsPerDay).floor()
data.describe()
Rows:1037712 Cols:16
tripduration | starttime | stoptime | start station id | start station name | start station latitude | start station longitude | end station id | end station name | end station latitude | end station longitude | bikeid | usertype | birth year | gender | Days | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | time | time | int | enum | real | real | int | enum | real | real | int | enum | int | int | int |
mins | 60.0 | 1380585668000.0 | 1380585883000.0 | 72.0 | 40.680342423 | -74.01713445 | 72.0 | 40.680342423 | -74.01713445 | 14529.0 | 1899.0 | 0.0 | 15979.0 | |||
mean | 825.6147543827192 | 1381888516917.714 | 1381889342532.4746 | 443.7142126139049 | 40.73451885864454 | -73.99113288482197 | 443.20742171238254 | 40.73428478848875 | -73.99127029824423 | 17644.071645119242 | 1975.7783948601839 | 1.123755916863252 | 15993.476745956474 | |||
maxs | 1259480.0 | 1383263997000.0 | 1383393310000.0 | 3002.0 | 40.770513 | -73.9500479759 | 3002.0 | 40.770513 | -73.9500479759 | 20757.0 | 1997.0 | 2.0 | 16009.0 | |||
sigma | 2000.3732322961862 | 778871729.1323168 | 778847387.5037588 | 354.43432507453724 | 0.01957340730530415 | 0.012316123410581171 | 357.39821705755827 | 0.019557845811587957 | 0.012385581196537298 | 1717.6811213447866 | 11.131490623834942 | 0.544380593291009 | 9.014533519116712 | |||
zeros | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 97498 | 0 | |||
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 97445 | 0 | 0 |
0 | 326.0 | 2013-10-01 00:01:08 | 2013-10-01 00:06:34 | 239.0 | Willoughby St & Fleet St | 40.69196566 | -73.9813018 | 366.0 | Clinton Ave & Myrtle Ave | 40.693261 | -73.968896 | 16052.0 | Subscriber | 1982.0 | 1.0 | 15979.0 |
1 | 729.0 | 2013-10-01 00:01:21 | 2013-10-01 00:13:30 | 322.0 | Clinton St & Tillary St | 40.696191999999996 | -73.991218 | 398.0 | Atlantic Ave & Furman St | 40.69165183 | -73.99997859999999 | 19412.0 | Customer | nan | 0.0 | 15979.0 |
2 | 520.0 | 2013-10-01 00:01:24 | 2013-10-01 00:10:04 | 174.0 | E 25 St & 1 Ave | 40.7381765 | -73.97738662 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 19645.0 | Subscriber | 1984.0 | 1.0 | 15979.0 |
3 | 281.0 | 2013-10-01 00:01:25 | 2013-10-01 00:06:06 | 430.0 | York St & Jay St | 40.7014851 | -73.98656928 | 323.0 | Lawrence St & Willoughby St | 40.69236178 | -73.98631746 | 16992.0 | Subscriber | 1985.0 | 1.0 | 15979.0 |
4 | 196.0 | 2013-10-01 00:01:27 | 2013-10-01 00:04:43 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 401.0 | Allen St & Rivington St | 40.72019576 | -73.98997825000001 | 15690.0 | Subscriber | 1986.0 | 1.0 | 15979.0 |
5 | 1948.0 | 2013-10-01 00:01:48 | 2013-10-01 00:34:16 | 369.0 | Washington Pl & 6 Ave | 40.73224119 | -74.00026394 | 307.0 | Canal St & Rutgers St | 40.714274870000004 | -73.98990025 | 19846.0 | Subscriber | 1977.0 | 1.0 | 15979.0 |
6 | 1327.0 | 2013-10-01 00:01:48 | 2013-10-01 00:23:55 | 254.0 | W 11 St & 6 Ave | 40.73532427 | -73.99800419 | 539.0 | Metropolitan Ave & Bedford Ave | 40.71534825 | -73.96024116 | 14563.0 | Subscriber | 1986.0 | 2.0 | 15979.0 |
7 | 1146.0 | 2013-10-01 00:01:57 | 2013-10-01 00:21:03 | 490.0 | 8 Ave & W 33 St | 40.751551 | -73.993934 | 438.0 | St Marks Pl & 1 Ave | 40.727791260000004 | -73.98564945 | 16793.0 | Subscriber | 1959.0 | 1.0 | 15979.0 |
8 | 380.0 | 2013-10-01 00:01:58 | 2013-10-01 00:08:18 | 468.0 | Broadway & W 55 St | 40.7652654 | -73.98192338 | 385.0 | E 55 St & 2 Ave | 40.757973220000004 | -73.96603308 | 16600.0 | Customer | nan | 0.0 | 15979.0 |
9 | 682.0 | 2013-10-01 00:02:05 | 2013-10-01 00:13:27 | 300.0 | Shevchenko Pl & E 6 St | 40.728145 | -73.990214 | 519.0 | Pershing Square N | 40.75188406 | -73.97770164 | 15204.0 | Subscriber | 1992.0 | 1.0 | 15979.0 |
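# As a quick sanity check (an aside, not part of the original demo), the
# epoch-day arithmetic can be verified with plain Python: 2013-10-01 should
# land on day 15979, matching the "Days" column above.
from datetime import datetime, timezone
ms = datetime(2013, 10, 1, tzinfo=timezone.utc).timestamp() * 1000
print(int(ms // (1000 * 3600 * 24)))  # -> 15979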
# Now do a monster group-by: count bike starts per station per day. For the
# big dataset this ends up with about 340 stations times 400 days (~140,000
# rows). This is what we want to predict.
grouped = data.group_by(["Days","start station name"])
bpd = grouped.count().get_frame() # Compute bikes-per-day
bpd.set_name(2,"bikes")
bpd.show()
bpd.describe()
bpd.dim
Days | start station name | bikes |
---|---|---|
15979 | 1 Ave & E 15 St | 173 |
15979 | 1 Ave & E 18 St | 118 |
15979 | 1 Ave & E 30 St | 152 |
15979 | 10 Ave & W 28 St | 115 |
15979 | 11 Ave & W 27 St | 210 |
15979 | 11 Ave & W 41 St | 106 |
15979 | 12 Ave & W 40 St | 144 |
15979 | 2 Ave & E 31 St | 206 |
15979 | 2 Ave & E 58 St | 105 |
15979 | 3 Ave & Schermerhorn St | 15 |
Rows:10131 Cols:3
Days | start station name | bikes | |
---|---|---|---|
type | int | enum | int |
mins | 15979.0 | 1.0 | |
mean | 15993.953311617806 | 102.42937518507551 | |
maxs | 16009.0 | 603.0 | |
sigma | 8.950698111468864 | 74.05933443246006 | |
zeros | 0 | 0 | |
missing | 0 | 0 | 0 |
0 | 15979.0 | 1 Ave & E 15 St | 173.0 |
1 | 15979.0 | 1 Ave & E 18 St | 118.0 |
2 | 15979.0 | 1 Ave & E 30 St | 152.0 |
3 | 15979.0 | 10 Ave & W 28 St | 115.0 |
4 | 15979.0 | 11 Ave & W 27 St | 210.0 |
5 | 15979.0 | 11 Ave & W 41 St | 106.0 |
6 | 15979.0 | 12 Ave & W 40 St | 144.0 |
7 | 15979.0 | 2 Ave & E 31 St | 206.0 |
8 | 15979.0 | 2 Ave & E 58 St | 105.0 |
9 | 15979.0 | 3 Ave & Schermerhorn St | 15.0 |
[10131, 3]
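# Aside (a sketch, not in the original demo): the group-by object can collect
# several aggregates before materializing, e.g. mean trip duration alongside
# the count:
# data.group_by(["Days", "start station name"]).count().mean("tripduration").get_frame()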
# Quantiles: the data is fairly unbalanced; some station/day combos are wildly
# more popular than others.
print("Quantiles of bikes-per-day")
bpd["bikes"].quantile().show()
Quantiles of bikes-per-day
Probs | bikesQuantiles |
---|---|
0.01 | 5 |
0.1 | 20 |
0.25 | 45 |
0.333 | 60 |
0.5 | 91 |
0.667 | 121 |
0.75 | 141 |
0.9 | 197 |
0.99 | 340.4 |
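# Aside (a sketch, not in the original demo): quantile() also accepts explicit
# probabilities, e.g. just the median and the 99th percentile:
bpd["bikes"].quantile(prob=[0.5, 0.99]).show()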
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
msecs = bpd["Days"] * msecsPerDay
bpd["Month"] = msecs.month().asfactor()  # categorical, not numeric
# Add in day-of-week (work-week; more bike rides on Sunday than Monday)
bpd["DayOfWeek"] = msecs.dayOfWeek()
print("Bikes-Per-Day")
bpd.describe()
Bikes-Per-Day
Rows:10131 Cols:5
Days | start station name | bikes | Month | DayOfWeek | |
---|---|---|---|---|---|
type | int | enum | int | enum | enum |
mins | 15979.0 | 1.0 | |||
mean | 15993.953311617806 | 102.42937518507551 | |||
maxs | 16009.0 | 603.0 | |||
sigma | 8.950698111468864 | 74.05933443246006 | |||
zeros | 0 | 0 | |||
missing | 0 | 0 | 0 | 0 | 0 |
0 | 15979.0 | 1 Ave & E 15 St | 173.0 | 10 | Tue |
1 | 15979.0 | 1 Ave & E 18 St | 118.0 | 10 | Tue |
2 | 15979.0 | 1 Ave & E 30 St | 152.0 | 10 | Tue |
3 | 15979.0 | 10 Ave & W 28 St | 115.0 | 10 | Tue |
4 | 15979.0 | 11 Ave & W 27 St | 210.0 | 10 | Tue |
5 | 15979.0 | 11 Ave & W 41 St | 106.0 | 10 | Tue |
6 | 15979.0 | 12 Ave & W 40 St | 144.0 | 10 | Tue |
7 | 15979.0 | 2 Ave & E 31 St | 206.0 | 10 | Tue |
8 | 15979.0 | 2 Ave & E 58 St | 105.0 | 10 | Tue |
9 | 15979.0 | 3 Ave & Schermerhorn St | 15.0 | 10 | Tue |
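# Aside (a sketch, not in the original demo): asfactor() makes Month a
# categorical with 12 levels, so the models treat months as distinct seasons
# rather than a linear trend. Quick check:
print(bpd["Month"].isfactor())    # -> [True]
print(bpd["DayOfWeek"].levels())  # day-of-week level names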
# ----------
# 3 - Fit a model on train, using test as validation
# Function for doing a classic test/train/holdout split
def split_fit_predict(data):
    global gbm0, drf0, glm0, dl0
    # Classic test/train split
    r = data['Days'].runif()  # Random UNIForm numbers, one per row
    train = data[r < 0.6]
    test  = data[(0.6 <= r) & (r < 0.9)]
    hold  = data[0.9 <= r]
    print("Training data has", train.ncol, "columns and", train.nrow,
          "rows, test has", test.nrow, "rows, holdout has", hold.nrow)
    bike_names_x = data.names
    bike_names_x.remove("bikes")

    # Run GBM
    s = time.time()
    gbm0 = H2OGradientBoostingEstimator(ntrees=500,  # 500 works well
                                        max_depth=6,
                                        learn_rate=0.1)
    gbm0.train(x=bike_names_x,
               y="bikes",
               training_frame=train,
               validation_frame=test)
    gbm_elapsed = time.time() - s

    # Run DRF
    s = time.time()
    drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)
    drf0.train(x=bike_names_x,
               y="bikes",
               training_frame=train,
               validation_frame=test)
    drf_elapsed = time.time() - s

    # Run GLM
    if "WC1" in bike_names_x:
        bike_names_x.remove("WC1")  # drop the mostly-missing weather-description column for GLM
    s = time.time()
    glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family="poisson")
    glm0.train(x=bike_names_x,
               y="bikes",
               training_frame=train,
               validation_frame=test)
    glm_elapsed = time.time() - s

    # Run DL
    s = time.time()
    dl0 = H2ODeepLearningEstimator(hidden=[50, 50, 50, 50], epochs=50)
    dl0.train(x=bike_names_x,
              y="bikes",
              training_frame=train,
              validation_frame=test)
    dl_elapsed = time.time() - s

    # ----------
    # 4 - Score on the holdout set & report
    train_mse_gbm = gbm0.model_performance(train).mse()
    test_mse_gbm  = gbm0.model_performance(test).mse()
    hold_mse_gbm  = gbm0.model_performance(hold).mse()
    # print("GBM mse TRAIN=", train_mse_gbm, ", mse TEST=", test_mse_gbm, ", mse HOLDOUT=", hold_mse_gbm)

    train_mse_drf = drf0.model_performance(train).mse()
    test_mse_drf  = drf0.model_performance(test).mse()
    hold_mse_drf  = drf0.model_performance(hold).mse()
    # print("DRF mse TRAIN=", train_mse_drf, ", mse TEST=", test_mse_drf, ", mse HOLDOUT=", hold_mse_drf)

    train_mse_glm = glm0.model_performance(train).mse()
    test_mse_glm  = glm0.model_performance(test).mse()
    hold_mse_glm  = glm0.model_performance(hold).mse()
    # print("GLM mse TRAIN=", train_mse_glm, ", mse TEST=", test_mse_glm, ", mse HOLDOUT=", hold_mse_glm)

    train_mse_dl = dl0.model_performance(train).mse()
    test_mse_dl  = dl0.model_performance(test).mse()
    hold_mse_dl  = dl0.model_performance(hold).mse()
    # print(" DL mse TRAIN=", train_mse_dl, ", mse TEST=", test_mse_dl, ", mse HOLDOUT=", hold_mse_dl)

    # Make a pretty HTML table printout of the results
    header = ["Model", "mse TRAIN", "mse TEST", "mse HOLDOUT", "Model Training Time (s)"]
    table = [
        ["GBM", train_mse_gbm, test_mse_gbm, hold_mse_gbm, round(gbm_elapsed, 3)],
        ["DRF", train_mse_drf, test_mse_drf, hold_mse_drf, round(drf_elapsed, 3)],
        ["GLM", train_mse_glm, test_mse_glm, hold_mse_glm, round(glm_elapsed, 3)],
        ["DL ", train_mse_dl,  test_mse_dl,  hold_mse_dl,  round(dl_elapsed, 3)],
    ]
    h2o.display.H2ODisplay(table, header)
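# Aside (a sketch, not in the original demo; the seed is a hypothetical choice):
# H2O's built-in splitter gives an equivalent 60/30/10 split without the
# runif() bookkeeping:
# train, test, hold = data.split_frame(ratios=[0.6, 0.3], seed=42)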
# --------------
# Split the data into train/test/holdout, fit some models, and score on the holdout data
split_fit_predict(bpd)
# Here we see an R^2 of about 0.91 for GBM and 0.71 for GLM. This means that
# given just the station, the month, and the day-of-week, we can explain
# roughly 90% of the variance in bike-trip starts.
Training data has 5 columns and 6180 rows, test has 2947 rows, holdout has 1004
gbm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%
Model | mse TRAIN | mse TEST | mse HOLDOUT | Model Training Time (s) |
---|---|---|---|---|
GBM | 0.8948171 | 386.7584398 | 428.7237120 | 7.759 |
DRF | 526.3541524 | 921.4867812 | 916.5091361 | 8.673 |
GLM | 689.6647078 | 757.4271445 | 726.9764530 | 0.522 |
DL | 307.5692122 | 459.6025357 | 509.2822086 | 8.619 |
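# Aside (a sketch, not in the original demo): the R^2 figures quoted above can
# be pulled from the validation metrics of the fitted models directly:
for name, model in [("GBM", gbm0), ("DRF", drf0), ("GLM", glm0), ("DL", dl0)]:
    print(name, "validation R^2:", model.model_performance(valid=True).r2())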
# ----------
# 5 - Now let's add some weather
# Load weather data
wthr1 = h2o.import_file(path=[mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"),
                              mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
# Peek at the data
wthr1.describe()
Parse progress: |█████████████████████████████████████████████████████████| 100%
Rows:17520 Cols:50
Year Local | Month Local | Day Local | Hour Local | Year UTC | Month UTC | Day UTC | Hour UTC | Cavok Reported | Cloud Ceiling (m) | Cloud Cover Fraction | Cloud Cover Fraction 1 | Cloud Cover Fraction 2 | Cloud Cover Fraction 3 | Cloud Cover Fraction 4 | Cloud Cover Fraction 5 | Cloud Cover Fraction 6 | Cloud Height (m) 1 | Cloud Height (m) 2 | Cloud Height (m) 3 | Cloud Height (m) 4 | Cloud Height (m) 5 | Cloud Height (m) 6 | Dew Point (C) | Humidity Fraction | Precipitation One Hour (mm) | Pressure Altimeter (mbar) | Pressure Sea Level (mbar) | Pressure Station (mbar) | Snow Depth (cm) | Temperature (C) | Visibility (km) | Weather Code 1 | Weather Code 1/ Description | Weather Code 2 | Weather Code 2/ Description | Weather Code 3 | Weather Code 3/ Description | Weather Code 4 | Weather Code 4/ Description | Weather Code 5 | Weather Code 5/ Description | Weather Code 6 | Weather Code 6/ Description | Weather Code Most Severe / Icon Code | Weather Code Most Severe | Weather Code Most Severe / Description | Wind Direction (degrees) | Wind Gust (m/s) | Wind Speed (m/s) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | int | int | int | int | int | real | real | real | real | real | int | int | int | real | real | real | int | int | int | real | real | real | real | int | int | int | real | real | int | enum | int | enum | int | enum | int | enum | int | enum | int | enum | int | int | enum | int | real | real |
mins | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 0.0 | 0.0 | 61.0 | 0.0 | 0.0 | 0.25 | 0.5 | NaN | NaN | NaN | 60.96 | 213.36 | 365.76 | NaN | NaN | NaN | -26.700000000000003 | 0.12510000000000002 | 0.0 | 983.2949000000001 | NaN | NaN | NaN | -15.600000000000001 | 0.001 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 3.0 | 0.0 | 1.0 | 10.0 | 7.2 | 0.0 | |||||||
mean | 2013.5 | 6.5260273972602745 | 15.72054794520548 | 11.500000000000004 | 2013.5005707762557 | 6.525114155251141 | 15.72134703196347 | 11.500114155251142 | 0.0 | 1306.3119584569736 | 0.4167424905220181 | 0.3612073490813649 | 0.8724453840732911 | 0.9630456852791879 | 0.0 | 0.0 | 0.0 | 1293.9822681953192 | 1643.7390016566796 | 2084.8938637563456 | 0.0 | 0.0 | 0.0 | 4.313046467655992 | 0.5967363891594567 | 1.3799301075268817 | 1017.8258144055944 | 0.0 | 0.0 | 0.0 | 12.578909070073914 | 14.391442968202009 | 4.84251968503937 | 3.6586768935762226 | 2.8466076696165192 | 2.0114942528735633 | 4.125 | 3.0 | 1.3784817351598173 | 4.84251968503937 | 194.69525681985743 | 9.422169480726348 | 2.4103288784874057 | |||||||
maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 2015.0 | 12.0 | 31.0 | 23.0 | 0.0 | 3657.6000000000004 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | 3657.5999 | 3657.5999 | 3657.5999 | NaN | NaN | NaN | 24.400000000000002 | 1.0 | 26.924 | 1042.2113 | NaN | NaN | NaN | 36.1 | 16.0934 | 60.0 | 60.0 | 36.0 | 27.0 | 27.0 | 3.0 | 16.0 | 60.0 | 360.0 | 20.580000000000002 | 10.8 | |||||||
sigma | 0.500014270017262 | 3.447949723847773 | 8.796498048523272 | 6.922384111875021 | 0.50058441171579 | 3.447824054577647 | 8.795614888684717 | 6.922301652025526 | 0.0 | 995.3398569657211 | 0.4627208309925301 | 0.42770569708047684 | 0.19715569036704708 | 0.08610155981044185 | -0.0 | -0.0 | -0.0 | 962.7430958537232 | 916.7386134899587 | 887.2158475113932 | -0.0 | -0.0 | -0.0 | 10.973128209713666 | 0.18579201186573496 | 2.5621512917896463 | 7.464516971789659 | -0.0 | -0.0 | -0.0 | 10.039673953091574 | 3.6989362303340494 | 5.704865769828319 | 6.133862539123368 | 5.805532863642112 | 3.1234084426128437 | 6.15223536610881 | 0.0 | 4.073860627017756 | 5.704865769828319 | 106.3500000314393 | 1.8151187111524154 | 1.614697905241178 | |||||||
zeros | 0 | 0 | 0 | 730 | 0 | 0 | 0 | 730 | 17455 | 0 | 8758 | 8758 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 268 | 0 | 501 | 0 | 0 | 0 | 0 | 269 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14980 | 0 | 0 | 0 | 2768 | |||||||
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 65 | 10780 | 375 | 375 | 14682 | 16535 | 17520 | 17520 | 17520 | 9103 | 14683 | 16535 | 17520 | 17520 | 17520 | 67 | 67 | 15660 | 360 | 17520 | 17520 | 17520 | 67 | 412 | 14980 | 14980 | 16477 | 16477 | 17181 | 17181 | 17433 | 17433 | 17504 | 17504 | 17518 | 17518 | 0 | 14980 | 14980 | 9382 | 14381 | 1283 |
0 | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 5.0 | 0.0 | 2895.6000000000004 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 2895.5999 | 3352.8 | nan | nan | nan | nan | -5.0 | 0.5447000000000001 | nan | 1013.0917000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 2.57 | |||||||
1 | 2013.0 | 1.0 | 1.0 | 1.0 | 2013.0 | 1.0 | 1.0 | 6.0 | 0.0 | 3048.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 3048.0 | nan | nan | nan | nan | nan | -4.4 | 0.5463 | nan | 1012.0759 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | 9.77 | 4.63 | |||||||
2 | 2013.0 | 1.0 | 1.0 | 2.0 | 2013.0 | 1.0 | 1.0 | 7.0 | 0.0 | 1828.8000000000002 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1828.7999 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.619 | nan | 1012.4145000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 7.72 | 1.54 | |||||||
3 | 2013.0 | 1.0 | 1.0 | 3.0 | 2013.0 | 1.0 | 1.0 | 8.0 | 0.0 | 1463.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1463.04 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 3.09 | |||||||
4 | 2013.0 | 1.0 | 1.0 | 4.0 | 2013.0 | 1.0 | 1.0 | 9.0 | 0.0 | 1402.1000000000001 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1402.0800000000002 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.7531 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | nan | 4.12 | |||||||
5 | 2013.0 | 1.0 | 1.0 | 5.0 | 2013.0 | 1.0 | 1.0 | 10.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 3.09 | |||||||
6 | 2013.0 | 1.0 | 1.0 | 6.0 | 2013.0 | 1.0 | 1.0 | 11.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.5934 | nan | 1012.0759 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 9.26 | 3.09 | |||||||
7 | 2013.0 | 1.0 | 1.0 | 7.0 | 2013.0 | 1.0 | 1.0 | 12.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.5934 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | 9.26 | 4.63 | |||||||
8 | 2013.0 | 1.0 | 1.0 | 8.0 | 2013.0 | 1.0 | 1.0 | 13.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6425000000000001 | nan | 1012.4145000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | nan | 3.09 | |||||||
9 | 2013.0 | 1.0 | 1.0 | 9.0 | 2013.0 | 1.0 | 1.0 | 14.0 | 0.0 | 1524.0 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 1524.0 | 3657.5999 | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 9.26 | 3.09 |
# Lots of columns in there! Let's plan on converting to time-since-epoch to do
# a 'join' with the bike data, plus gather weather info that might affect
# cyclists - rain, snow, temperature. Alas, we have to drop the snow column
# since it's all NAs. Also add in dew point and humidity just in case. Slice
# out just the columns of interest and drop the rest.
wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]
wthr2.set_name(wthr2.names.index("Precipitation One Hour (mm)"), "Rain (mm)")
wthr2.set_name(wthr2.names.index("Weather Code 1/ Description"), "WC1")
wthr2.describe()
# Much better!
Rows:17520 Cols:9
Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | |
---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | real | real | real | real | enum |
mins | 2013.0 | 1.0 | 1.0 | 0.0 | -26.700000000000003 | 0.12510000000000002 | 0.0 | -15.600000000000001 | |
mean | 2013.5 | 6.5260273972602745 | 15.72054794520548 | 11.500000000000004 | 4.313046467655992 | 0.5967363891594567 | 1.3799301075268817 | 12.578909070073914 | |
maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 24.400000000000002 | 1.0 | 26.924 | 36.1 | |
sigma | 0.500014270017262 | 3.447949723847773 | 8.796498048523272 | 6.922384111875021 | 10.973128209713666 | 0.18579201186573496 | 2.5621512917896463 | 10.039673953091574 | |
zeros | 0 | 0 | 0 | 730 | 268 | 0 | 501 | 269 | |
missing | 0 | 0 | 0 | 0 | 67 | 67 | 15660 | 67 | 14980 |
0 | 2013.0 | 1.0 | 1.0 | 0.0 | -5.0 | 0.5447000000000001 | nan | 3.3000000000000003 | |
1 | 2013.0 | 1.0 | 1.0 | 1.0 | -4.4 | 0.5463 | nan | 3.9000000000000004 | |
2 | 2013.0 | 1.0 | 1.0 | 2.0 | -3.3000000000000003 | 0.619 | nan | 3.3000000000000003 | |
3 | 2013.0 | 1.0 | 1.0 | 3.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
4 | 2013.0 | 1.0 | 1.0 | 4.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
5 | 2013.0 | 1.0 | 1.0 | 5.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
6 | 2013.0 | 1.0 | 1.0 | 6.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | |
7 | 2013.0 | 1.0 | 1.0 | 7.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | |
8 | 2013.0 | 1.0 | 1.0 | 8.0 | -2.8000000000000003 | 0.6425000000000001 | nan | 3.3000000000000003 | |
9 | 2013.0 | 1.0 | 1.0 | 9.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 |
# Filter down to the weather at noon
wthr3 = wthr2[wthr2["Hour Local"] == 12]
# Let's now get days since the epoch: convert year/month/day into epoch time
# (msec), then into epoch days. mktime needs zero-based months and days, but
# we have 1-based ones.
wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"] - 1,
                                    day=wthr3["Day Local"] - 1, hour=wthr3["Hour Local"])
msecsPerDay = 1000 * 60 * 60 * 24  # milliseconds per day again
wthr3["Days"] = (wthr3["msec"] / msecsPerDay).floor()
wthr3.describe()
# msec looks sane (values around 1.3e12 are the right magnitude for msec since
# 1970), and the epoch-day numbers line up with those from the CitiBike
# dataset.
Rows:730 Cols:11
Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | msec | Days | |
---|---|---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | real | real | real | real | enum | int | int |
mins | 2013.0 | 1.0 | 1.0 | 12.0 | -26.700000000000003 | 0.1723 | 0.0 | -13.9 | 1357070400000.0 | 15706.0 | |
mean | 2013.5 | 6.526027397260274 | 15.72054794520548 | 12.0 | 4.230123796423659 | 0.539728198074278 | 1.5312571428571429 | 14.068775790921595 | 1388560852602.7397 | 16070.5 | |
maxs | 2014.0 | 12.0 | 31.0 | 12.0 | 23.3 | 1.0 | 12.446 | 34.4 | 1420056000000.0 | 16435.0 | |
sigma | 0.5003428180039172 | 3.450215293068149 | 8.802278027009615 | 0.0 | 11.106296472475226 | 0.17994502792324327 | 2.3606424861490587 | 10.398985514891212 | 18219740080.410755 | 210.87713642466474 | |
zeros | 0 | 0 | 0 | 0 | 14 | 0 | 15 | 7 | 0 | 0 | |
missing | 0 | 0 | 0 | 0 | 3 | 3 | 660 | 3 | 620 | 0 | 0 |
0 | 2013.0 | 1.0 | 1.0 | 12.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | 1357070400000.0 | 15706.0 | |
1 | 2013.0 | 1.0 | 2.0 | 12.0 | -11.700000000000001 | 0.4806 | nan | -2.2 | 1357156800000.0 | 15707.0 | |
2 | 2013.0 | 1.0 | 3.0 | 12.0 | -10.600000000000001 | 0.5248 | nan | -2.2 | 1357243200000.0 | 15708.0 | |
3 | 2013.0 | 1.0 | 4.0 | 12.0 | -7.2 | 0.49760000000000004 | nan | 2.2 | 1357329600000.0 | 15709.0 | |
4 | 2013.0 | 1.0 | 5.0 | 12.0 | -7.2 | 0.42600000000000005 | nan | 4.4 | 1357416000000.0 | 15710.0 | |
5 | 2013.0 | 1.0 | 6.0 | 12.0 | -1.7000000000000002 | 0.6451 | nan | 4.4 | haze | 1357502400000.0 | 15711.0 |
6 | 2013.0 | 1.0 | 7.0 | 12.0 | -6.1000000000000005 | 0.41190000000000004 | nan | 6.1000000000000005 | 1357588800000.0 | 15712.0 | |
7 | 2013.0 | 1.0 | 8.0 | 12.0 | -1.7000000000000002 | 0.5314 | nan | 7.2 | 1357675200000.0 | 15713.0 | |
8 | 2013.0 | 1.0 | 9.0 | 12.0 | 0.6000000000000001 | 0.56 | nan | 8.9 | haze | 1357761600000.0 | 15714.0 |
9 | 2013.0 | 1.0 | 10.0 | 12.0 | -6.1000000000000005 | 0.3952 | nan | 6.7 | 1357848000000.0 | 15715.0 |
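# Quick check (an aside, not part of the original demo): the first msec value
# floors to epoch day 15706, i.e. 2013-01-01, matching the Days column.
print(1357070400000 // (1000 * 60 * 60 * 24))  # -> 15706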
# Let's drop the extra time columns to make an easy-to-handle dataset.
wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
# Also, most rain numbers are missing - let's assume those are zero-rain days
rain = wthr4["Rain (mm)"]
rain[ rain.isna() ] = 0
wthr4["Rain (mm)"] = rain
# ----------
# 6 - Join the weather data-per-day to the bike-starts-per-day
print("Merge Daily Weather with Bikes-Per-Day")
bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)
bpd_with_weather.describe()
bpd_with_weather.show()
Merge Daily Weather with Bikes-Per-Day
Rows:10131 Cols:10
Days | start station name | bikes | Month | DayOfWeek | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | |
---|---|---|---|---|---|---|---|---|---|---|
type | int | enum | int | enum | enum | real | real | real | real | enum |
mins | 15979.0 | 1.0 | -2.2 | 0.34850000000000003 | 0.0 | 9.4 | ||||
mean | 15993.953311617806 | 102.42937518507551 | 7.60732405488106 | 0.5564958839206396 | 0.008198400947586611 | 16.937094067712962 | ||||
maxs | 16009.0 | 603.0 | 19.400000000000002 | 0.8718 | 0.254 | 26.1 | ||||
sigma | 8.950698111468864 | 74.05933443246006 | 6.516386487040385 | 0.14811201086649933 | 0.04489297266255909 | 4.362687300129602 | ||||
zeros | 0 | 0 | 0 | 0 | 9804 | 0 | ||||
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8816 |
0 | 15979.0 | 1 Ave & E 15 St | 173.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
1 | 15979.0 | 1 Ave & E 18 St | 118.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
2 | 15979.0 | 1 Ave & E 30 St | 152.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
3 | 15979.0 | 10 Ave & W 28 St | 115.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
4 | 15979.0 | 11 Ave & W 27 St | 210.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
5 | 15979.0 | 11 Ave & W 41 St | 106.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
6 | 15979.0 | 12 Ave & W 40 St | 144.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
7 | 15979.0 | 2 Ave & E 31 St | 206.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
8 | 15979.0 | 2 Ave & E 58 St | 105.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
9 | 15979.0 | 3 Ave & Schermerhorn St | 15.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 |
Days | start station name | bikes | Month | DayOfWeek | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 |
---|---|---|---|---|---|---|---|---|---|
15979 | 1 Ave & E 15 St | 173 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 1 Ave & E 18 St | 118 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 1 Ave & E 30 St | 152 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 10 Ave & W 28 St | 115 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 11 Ave & W 27 St | 210 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 11 Ave & W 41 St | 106 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 12 Ave & W 40 St | 144 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 2 Ave & E 31 St | 206 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 2 Ave & E 58 St | 105 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 3 Ave & Schermerhorn St | 15 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 |
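# Aside (a sketch, not in the original demo): merge() joins on the shared
# column names, here just "Days"; the key could be made explicit with
# by_x/by_y:
# bpd_with_weather = bpd.merge(wthr4, all_x=True, all_y=False, by_x=["Days"], by_y=["Days"])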
# ----------
# 7 - Test/train split again, model build again, this time with weather
split_fit_predict(bpd_with_weather)
Training data has 10 columns and 6066 rows, test has 3044 rows, holdout has 1021
gbm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%
Model | mse TRAIN | mse TEST | mse HOLDOUT | Model Training Time (s) |
---|---|---|---|---|
GBM | 0.2159977 | 393.0248269 | 404.2520310 | 8.679 |
DRF | 804.2152039 | 1703.1540562 | 1782.0854925 | 6.573 |
GLM | 620.8814844 | 735.9622856 | 789.7891737 | 0.241 |
DL | 213.8582644 | 454.7871732 | 476.5995571 | 7.518 |
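# Aside (a sketch, not in the original demo): to produce actual forecasts for
# load balancing, score station/day rows with the trained model, e.g.:
# preds = gbm0.predict(bpd_with_weather)
# preds.head()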