import h2o
import time
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
# Explore a typical Data Science workflow with H2O and Python
#
# Goal: help the manager of NYC's CitiBike system load-balance the bicycles
# across the network of stations, by predicting the number of bike trips
# taken from each station each day. Use 10 million rows of historical data,
# and eventually add weather data.
# Connect to a cluster
h2o.init()
Checking whether there is an H2O instance running at http://localhost:54321. connected.
H2O cluster uptime: | 08 secs |
H2O cluster version: | 3.11.0.99999 |
H2O cluster version age: | 1 minute |
H2O cluster name: | pasha |
H2O cluster total nodes: | 1 |
H2O cluster free memory: | 3.556 Gb |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster status: | accepting new members, healthy |
H2O connection url: | http://localhost:54321 |
H2O connection proxy: | None |
Python version: | 3.5.2 final |
from h2o.utils.shared_utils import _locate  # private function, used to find files within the h2o git project directory
# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = False
def mylocate(s):
    if data_source_is_s3:
        return "s3n://h2o-public-test-data/" + s
    else:
        return _locate(s)
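# For example (a hypothetical illustration, not run here): with
# data_source_is_s3 = True,
#   mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")
# returns "s3n://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-10.csv";
# otherwise _locate() resolves the same relative path inside the h2o git checkout.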
# Pick either the big or the small demo.
# The big dataset totals about 10M rows.
small_test = [mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")]
big_test = [mylocate("bigdata/laptop/citibike-nyc/2013-07.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2013-08.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2013-09.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2013-10.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2013-11.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2013-12.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-01.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-02.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-03.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-04.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-05.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-06.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-07.csv"),
            mylocate("bigdata/laptop/citibike-nyc/2014-08.csv")]
# ----------
# 1 - Load data: one row per bicycle trip, with columns for the start and end
# station, the trip duration, and the trip start time and day. The larger
# dataset totals about 10 million rows.
print("Import and Parse bike data")
data = h2o.import_file(path=small_test)
Import and Parse bike data
Parse progress: |█████████████████████████████████████████████████████████| 100%
# ----------
# 2 - Light data munging: group the bike starts per day, converting the 10M
# rows of trips into about 140,000 station/day combinations - the number of
# trip starts per station per day is what we want to predict.
# Convert start time to days since the epoch
start_time = data["starttime"]
msecsPerDay = 1000 * 3600 * 24  # H2O time columns are milliseconds since the epoch
data["Days"] = (start_time.asnumeric() / msecsPerDay).floor()
data.describe()
Rows:1037712 Cols:16
tripduration | starttime | stoptime | start station id | start station name | start station latitude | start station longitude | end station id | end station name | end station latitude | end station longitude | bikeid | usertype | birth year | gender | Days | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | time | time | int | enum | real | real | int | enum | real | real | int | enum | int | int | int |
mins | 60.0 | 1380585668000.0 | 1380585883000.0 | 72.0 | 40.680342423 | -74.01713445 | 72.0 | 40.680342423 | -74.01713445 | 14529.0 | 1899.0 | 0.0 | 15979.0 | |||
mean | 825.6147543827192 | 1381888516917.714 | 1381889342532.4746 | 443.7142126139049 | 40.73451885864454 | -73.99113288482197 | 443.20742171238254 | 40.73428478848875 | -73.99127029824423 | 17644.071645119242 | 1975.7783948601839 | 1.123755916863252 | 15993.476745956474 | |||
maxs | 1259480.0 | 1383263997000.0 | 1383393310000.0 | 3002.0 | 40.770513 | -73.9500479759 | 3002.0 | 40.770513 | -73.9500479759 | 20757.0 | 1997.0 | 2.0 | 16009.0 | |||
sigma | 2000.3732322961862 | 778871729.1323168 | 778847387.5037588 | 354.43432507453724 | 0.01957340730530415 | 0.012316123410581171 | 357.39821705755827 | 0.019557845811587957 | 0.012385581196537298 | 1717.6811213447866 | 11.131490623834942 | 0.544380593291009 | 9.014533519116712 | |||
zeros | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 97498 | 0 | |||
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 97445 | 0 | 0 |
0 | 326.0 | 2013-10-01 00:01:08 | 2013-10-01 00:06:34 | 239.0 | Willoughby St & Fleet St | 40.69196566 | -73.9813018 | 366.0 | Clinton Ave & Myrtle Ave | 40.693261 | -73.968896 | 16052.0 | Subscriber | 1982.0 | 1.0 | 15979.0 |
1 | 729.0 | 2013-10-01 00:01:21 | 2013-10-01 00:13:30 | 322.0 | Clinton St & Tillary St | 40.696191999999996 | -73.991218 | 398.0 | Atlantic Ave & Furman St | 40.69165183 | -73.99997859999999 | 19412.0 | Customer | nan | 0.0 | 15979.0 |
2 | 520.0 | 2013-10-01 00:01:24 | 2013-10-01 00:10:04 | 174.0 | E 25 St & 1 Ave | 40.7381765 | -73.97738662 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 19645.0 | Subscriber | 1984.0 | 1.0 | 15979.0 |
3 | 281.0 | 2013-10-01 00:01:25 | 2013-10-01 00:06:06 | 430.0 | York St & Jay St | 40.7014851 | -73.98656928 | 323.0 | Lawrence St & Willoughby St | 40.69236178 | -73.98631746 | 16992.0 | Subscriber | 1985.0 | 1.0 | 15979.0 |
4 | 196.0 | 2013-10-01 00:01:27 | 2013-10-01 00:04:43 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 401.0 | Allen St & Rivington St | 40.72019576 | -73.98997825000001 | 15690.0 | Subscriber | 1986.0 | 1.0 | 15979.0 |
5 | 1948.0 | 2013-10-01 00:01:48 | 2013-10-01 00:34:16 | 369.0 | Washington Pl & 6 Ave | 40.73224119 | -74.00026394 | 307.0 | Canal St & Rutgers St | 40.714274870000004 | -73.98990025 | 19846.0 | Subscriber | 1977.0 | 1.0 | 15979.0 |
6 | 1327.0 | 2013-10-01 00:01:48 | 2013-10-01 00:23:55 | 254.0 | W 11 St & 6 Ave | 40.73532427 | -73.99800419 | 539.0 | Metropolitan Ave & Bedford Ave | 40.71534825 | -73.96024116 | 14563.0 | Subscriber | 1986.0 | 2.0 | 15979.0 |
7 | 1146.0 | 2013-10-01 00:01:57 | 2013-10-01 00:21:03 | 490.0 | 8 Ave & W 33 St | 40.751551 | -73.993934 | 438.0 | St Marks Pl & 1 Ave | 40.727791260000004 | -73.98564945 | 16793.0 | Subscriber | 1959.0 | 1.0 | 15979.0 |
8 | 380.0 | 2013-10-01 00:01:58 | 2013-10-01 00:08:18 | 468.0 | Broadway & W 55 St | 40.7652654 | -73.98192338 | 385.0 | E 55 St & 2 Ave | 40.757973220000004 | -73.96603308 | 16600.0 | Customer | nan | 0.0 | 15979.0 |
9 | 682.0 | 2013-10-01 00:02:05 | 2013-10-01 00:13:27 | 300.0 | Shevchenko Pl & E 6 St | 40.728145 | -73.990214 | 519.0 | Pershing Square N | 40.75188406 | -73.97770164 | 15204.0 | Subscriber | 1992.0 | 1.0 | 15979.0 |
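# As a quick sanity check (an aside, not part of the original demo), the
# epoch-day arithmetic can be verified with plain Python: 2013-10-01 should
# land on day 15979, matching the "Days" column above.
from datetime import datetime, timezone
ms = datetime(2013, 10, 1, tzinfo=timezone.utc).timestamp() * 1000
print(int(ms // (1000 * 3600 * 24)))  # -> 15979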
# Now do a monster group-by: count bike starts per station per day. For the
# big dataset this ends up with about 340 stations times 400 days (~140,000
# rows). This is what we want to predict.
grouped = data.group_by(["Days","start station name"])
bpd = grouped.count().get_frame() # Compute bikes-per-day
bpd.set_name(2,"bikes")
bpd.show()
bpd.describe()
bpd.dim
Days | start station name | bikes |
---|---|---|
15979 | 1 Ave & E 15 St | 173 |
15979 | 1 Ave & E 18 St | 118 |
15979 | 1 Ave & E 30 St | 152 |
15979 | 10 Ave & W 28 St | 115 |
15979 | 11 Ave & W 27 St | 210 |
15979 | 11 Ave & W 41 St | 106 |
15979 | 12 Ave & W 40 St | 144 |
15979 | 2 Ave & E 31 St | 206 |
15979 | 2 Ave & E 58 St | 105 |
15979 | 3 Ave & Schermerhorn St | 15 |
Rows:10131 Cols:3
Days | start station name | bikes | |
---|---|---|---|
type | int | enum | int |
mins | 15979.0 | 1.0 | |
mean | 15993.953311617806 | 102.42937518507551 | |
maxs | 16009.0 | 603.0 | |
sigma | 8.950698111468864 | 74.05933443246006 | |
zeros | 0 | 0 | |
missing | 0 | 0 | 0 |
0 | 15979.0 | 1 Ave & E 15 St | 173.0 |
1 | 15979.0 | 1 Ave & E 18 St | 118.0 |
2 | 15979.0 | 1 Ave & E 30 St | 152.0 |
3 | 15979.0 | 10 Ave & W 28 St | 115.0 |
4 | 15979.0 | 11 Ave & W 27 St | 210.0 |
5 | 15979.0 | 11 Ave & W 41 St | 106.0 |
6 | 15979.0 | 12 Ave & W 40 St | 144.0 |
7 | 15979.0 | 2 Ave & E 31 St | 206.0 |
8 | 15979.0 | 2 Ave & E 58 St | 105.0 |
9 | 15979.0 | 3 Ave & Schermerhorn St | 15.0 |
[10131, 3]
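# Aside (a sketch, not in the original demo): the group-by object can collect
# several aggregates before materializing, e.g. mean trip duration alongside
# the count:
# data.group_by(["Days", "start station name"]).count().mean("tripduration").get_frame()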
# Quantiles: the data is fairly unbalanced; some station/day combos are wildly
# more popular than others.
print("Quantiles of bikes-per-day")
bpd["bikes"].quantile().show()
Quantiles of bikes-per-day
Probs | bikesQuantiles |
---|---|
0.01 | 5 |
0.1 | 20 |
0.25 | 45 |
0.333 | 60 |
0.5 | 91 |
0.667 | 121 |
0.75 | 141 |
0.9 | 197 |
0.99 | 340.4 |
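# Aside (a sketch, not in the original demo): quantile() also accepts explicit
# probabilities, e.g. just the median and the 99th percentile:
bpd["bikes"].quantile(prob=[0.5, 0.99]).show()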
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
msecs = bpd["Days"] * msecsPerDay
bpd["Month"] = msecs.month().asfactor()  # categorical, not numeric
# Add in day-of-week (work-week; more bike rides on Sunday than Monday)
bpd["DayOfWeek"] = msecs.dayOfWeek()
print("Bikes-Per-Day")
bpd.describe()
Bikes-Per-Day
Rows:10131 Cols:5
Days | start station name | bikes | Month | DayOfWeek | |
---|---|---|---|---|---|
type | int | enum | int | enum | enum |
mins | 15979.0 | 1.0 | |||
mean | 15993.953311617806 | 102.42937518507551 | |||
maxs | 16009.0 | 603.0 | |||
sigma | 8.950698111468864 | 74.05933443246006 | |||
zeros | 0 | 0 | |||
missing | 0 | 0 | 0 | 0 | 0 |
0 | 15979.0 | 1 Ave & E 15 St | 173.0 | 10 | Tue |
1 | 15979.0 | 1 Ave & E 18 St | 118.0 | 10 | Tue |
2 | 15979.0 | 1 Ave & E 30 St | 152.0 | 10 | Tue |
3 | 15979.0 | 10 Ave & W 28 St | 115.0 | 10 | Tue |
4 | 15979.0 | 11 Ave & W 27 St | 210.0 | 10 | Tue |
5 | 15979.0 | 11 Ave & W 41 St | 106.0 | 10 | Tue |
6 | 15979.0 | 12 Ave & W 40 St | 144.0 | 10 | Tue |
7 | 15979.0 | 2 Ave & E 31 St | 206.0 | 10 | Tue |
8 | 15979.0 | 2 Ave & E 58 St | 105.0 | 10 | Tue |
9 | 15979.0 | 3 Ave & Schermerhorn St | 15.0 | 10 | Tue |
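# Aside (a sketch, not in the original demo): asfactor() makes Month a
# categorical with 12 levels, so the models treat months as distinct seasons
# rather than a linear trend. Quick check:
print(bpd["Month"].isfactor())    # -> [True]
print(bpd["DayOfWeek"].levels())  # day-of-week level names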
# ----------
# 3 - Fit a model on train, using test as validation
# Function for doing a classic test/train/holdout split
def split_fit_predict(data):
    global gbm0, drf0, glm0, dl0
    # Classic test/train split
    r = data['Days'].runif()  # Random UNIForm numbers, one per row
    train = data[r < 0.6]
    test  = data[(0.6 <= r) & (r < 0.9)]
    hold  = data[0.9 <= r]
    print("Training data has", train.ncol, "columns and", train.nrow,
          "rows, test has", test.nrow, "rows, holdout has", hold.nrow)
    bike_names_x = data.names
    bike_names_x.remove("bikes")

    # Run GBM
    s = time.time()
    gbm0 = H2OGradientBoostingEstimator(ntrees=500,  # 500 works well
                                        max_depth=6,
                                        learn_rate=0.1)
    gbm0.train(x=bike_names_x,
               y="bikes",
               training_frame=train,
               validation_frame=test)
    gbm_elapsed = time.time() - s

    # Run DRF
    s = time.time()
    drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)
    drf0.train(x=bike_names_x,
               y="bikes",
               training_frame=train,
               validation_frame=test)
    drf_elapsed = time.time() - s

    # Run GLM
    if "WC1" in bike_names_x:
        bike_names_x.remove("WC1")  # drop the mostly-missing weather-description column for GLM
    s = time.time()
    glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family="poisson")
    glm0.train(x=bike_names_x,
               y="bikes",
               training_frame=train,
               validation_frame=test)
    glm_elapsed = time.time() - s

    # Run DL
    s = time.time()
    dl0 = H2ODeepLearningEstimator(hidden=[50, 50, 50, 50], epochs=50)
    dl0.train(x=bike_names_x,
              y="bikes",
              training_frame=train,
              validation_frame=test)
    dl_elapsed = time.time() - s

    # ----------
    # 4 - Score on the holdout set & report
    train_mse_gbm = gbm0.model_performance(train).mse()
    test_mse_gbm  = gbm0.model_performance(test).mse()
    hold_mse_gbm  = gbm0.model_performance(hold).mse()
    # print("GBM mse TRAIN=", train_mse_gbm, ", mse TEST=", test_mse_gbm, ", mse HOLDOUT=", hold_mse_gbm)

    train_mse_drf = drf0.model_performance(train).mse()
    test_mse_drf  = drf0.model_performance(test).mse()
    hold_mse_drf  = drf0.model_performance(hold).mse()
    # print("DRF mse TRAIN=", train_mse_drf, ", mse TEST=", test_mse_drf, ", mse HOLDOUT=", hold_mse_drf)

    train_mse_glm = glm0.model_performance(train).mse()
    test_mse_glm  = glm0.model_performance(test).mse()
    hold_mse_glm  = glm0.model_performance(hold).mse()
    # print("GLM mse TRAIN=", train_mse_glm, ", mse TEST=", test_mse_glm, ", mse HOLDOUT=", hold_mse_glm)

    train_mse_dl = dl0.model_performance(train).mse()
    test_mse_dl  = dl0.model_performance(test).mse()
    hold_mse_dl  = dl0.model_performance(hold).mse()
    # print(" DL mse TRAIN=", train_mse_dl, ", mse TEST=", test_mse_dl, ", mse HOLDOUT=", hold_mse_dl)

    # Make a pretty HTML table printout of the results
    header = ["Model", "mse TRAIN", "mse TEST", "mse HOLDOUT", "Model Training Time (s)"]
    table = [
        ["GBM", train_mse_gbm, test_mse_gbm, hold_mse_gbm, round(gbm_elapsed, 3)],
        ["DRF", train_mse_drf, test_mse_drf, hold_mse_drf, round(drf_elapsed, 3)],
        ["GLM", train_mse_glm, test_mse_glm, hold_mse_glm, round(glm_elapsed, 3)],
        ["DL ", train_mse_dl,  test_mse_dl,  hold_mse_dl,  round(dl_elapsed, 3)],
    ]
    h2o.display.H2ODisplay(table, header)
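# Aside (a sketch, not in the original demo; the seed is a hypothetical choice):
# H2O's built-in splitter gives an equivalent 60/30/10 split without the
# runif() bookkeeping:
# train, test, hold = data.split_frame(ratios=[0.6, 0.3], seed=42)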
# --------------
# Split the data into train/test/holdout, fit some models, and score on the holdout data
split_fit_predict(bpd)
# Here we see an R^2 of about 0.91 for GBM and 0.71 for GLM. This means that
# given just the station, the month, and the day-of-week, we can explain
# roughly 90% of the variance in bike-trip starts.
Training data has 5 columns and 6180 rows, test has 2947 rows, holdout has 1004
gbm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%
Model | mse TRAIN | mse TEST | mse HOLDOUT | Model Training Time (s) |
---|---|---|---|---|
GBM | 0.8948171 | 386.7584398 | 428.7237120 | 7.759 |
DRF | 526.3541524 | 921.4867812 | 916.5091361 | 8.673 |
GLM | 689.6647078 | 757.4271445 | 726.9764530 | 0.522 |
DL | 307.5692122 | 459.6025357 | 509.2822086 | 8.619 |
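# Aside (a sketch, not in the original demo): the R^2 figures quoted above can
# be pulled from the validation metrics of the fitted models directly:
for name, model in [("GBM", gbm0), ("DRF", drf0), ("GLM", glm0), ("DL", dl0)]:
    print(name, "validation R^2:", model.model_performance(valid=True).r2())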
# ----------
# 5 - Now let's add some weather
# Load weather data
wthr1 = h2o.import_file(path=[mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"),
                              mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
# Peek at the data
wthr1.describe()
Parse progress: |█████████████████████████████████████████████████████████| 100%
Rows:17520 Cols:50
Year Local | Month Local | Day Local | Hour Local | Year UTC | Month UTC | Day UTC | Hour UTC | Cavok Reported | Cloud Ceiling (m) | Cloud Cover Fraction | Cloud Cover Fraction 1 | Cloud Cover Fraction 2 | Cloud Cover Fraction 3 | Cloud Cover Fraction 4 | Cloud Cover Fraction 5 | Cloud Cover Fraction 6 | Cloud Height (m) 1 | Cloud Height (m) 2 | Cloud Height (m) 3 | Cloud Height (m) 4 | Cloud Height (m) 5 | Cloud Height (m) 6 | Dew Point (C) | Humidity Fraction | Precipitation One Hour (mm) | Pressure Altimeter (mbar) | Pressure Sea Level (mbar) | Pressure Station (mbar) | Snow Depth (cm) | Temperature (C) | Visibility (km) | Weather Code 1 | Weather Code 1/ Description | Weather Code 2 | Weather Code 2/ Description | Weather Code 3 | Weather Code 3/ Description | Weather Code 4 | Weather Code 4/ Description | Weather Code 5 | Weather Code 5/ Description | Weather Code 6 | Weather Code 6/ Description | Weather Code Most Severe / Icon Code | Weather Code Most Severe | Weather Code Most Severe / Description | Wind Direction (degrees) | Wind Gust (m/s) | Wind Speed (m/s) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | int | int | int | int | int | real | real | real | real | real | int | int | int | real | real | real | int | int | int | real | real | real | real | int | int | int | real | real | int | enum | int | enum | int | enum | int | enum | int | enum | int | enum | int | int | enum | int | real | real |
mins | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 0.0 | 0.0 | 61.0 | 0.0 | 0.0 | 0.25 | 0.5 | NaN | NaN | NaN | 60.96 | 213.36 | 365.76 | NaN | NaN | NaN | -26.700000000000003 | 0.12510000000000002 | 0.0 | 983.2949000000001 | NaN | NaN | NaN | -15.600000000000001 | 0.001 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 3.0 | 0.0 | 1.0 | 10.0 | 7.2 | 0.0 | |||||||
mean | 2013.5 | 6.5260273972602745 | 15.72054794520548 | 11.500000000000004 | 2013.5005707762557 | 6.525114155251141 | 15.72134703196347 | 11.500114155251142 | 0.0 | 1306.3119584569736 | 0.4167424905220181 | 0.3612073490813649 | 0.8724453840732911 | 0.9630456852791879 | 0.0 | 0.0 | 0.0 | 1293.9822681953192 | 1643.7390016566796 | 2084.8938637563456 | 0.0 | 0.0 | 0.0 | 4.313046467655992 | 0.5967363891594567 | 1.3799301075268817 | 1017.8258144055944 | 0.0 | 0.0 | 0.0 | 12.578909070073914 | 14.391442968202009 | 4.84251968503937 | 3.6586768935762226 | 2.8466076696165192 | 2.0114942528735633 | 4.125 | 3.0 | 1.3784817351598173 | 4.84251968503937 | 194.69525681985743 | 9.422169480726348 | 2.4103288784874057 | |||||||
maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 2015.0 | 12.0 | 31.0 | 23.0 | 0.0 | 3657.6000000000004 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | 3657.5999 | 3657.5999 | 3657.5999 | NaN | NaN | NaN | 24.400000000000002 | 1.0 | 26.924 | 1042.2113 | NaN | NaN | NaN | 36.1 | 16.0934 | 60.0 | 60.0 | 36.0 | 27.0 | 27.0 | 3.0 | 16.0 | 60.0 | 360.0 | 20.580000000000002 | 10.8 | |||||||
sigma | 0.500014270017262 | 3.447949723847773 | 8.796498048523272 | 6.922384111875021 | 0.50058441171579 | 3.447824054577647 | 8.795614888684717 | 6.922301652025526 | 0.0 | 995.3398569657211 | 0.4627208309925301 | 0.42770569708047684 | 0.19715569036704708 | 0.08610155981044185 | -0.0 | -0.0 | -0.0 | 962.7430958537232 | 916.7386134899587 | 887.2158475113932 | -0.0 | -0.0 | -0.0 | 10.973128209713666 | 0.18579201186573496 | 2.5621512917896463 | 7.464516971789659 | -0.0 | -0.0 | -0.0 | 10.039673953091574 | 3.6989362303340494 | 5.704865769828319 | 6.133862539123368 | 5.805532863642112 | 3.1234084426128437 | 6.15223536610881 | 0.0 | 4.073860627017756 | 5.704865769828319 | 106.3500000314393 | 1.8151187111524154 | 1.614697905241178 | |||||||
zeros | 0 | 0 | 0 | 730 | 0 | 0 | 0 | 730 | 17455 | 0 | 8758 | 8758 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 268 | 0 | 501 | 0 | 0 | 0 | 0 | 269 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14980 | 0 | 0 | 0 | 2768 | |||||||
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 65 | 10780 | 375 | 375 | 14682 | 16535 | 17520 | 17520 | 17520 | 9103 | 14683 | 16535 | 17520 | 17520 | 17520 | 67 | 67 | 15660 | 360 | 17520 | 17520 | 17520 | 67 | 412 | 14980 | 14980 | 16477 | 16477 | 17181 | 17181 | 17433 | 17433 | 17504 | 17504 | 17518 | 17518 | 0 | 14980 | 14980 | 9382 | 14381 | 1283 |
0 | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 5.0 | 0.0 | 2895.6000000000004 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 2895.5999 | 3352.8 | nan | nan | nan | nan | -5.0 | 0.5447000000000001 | nan | 1013.0917000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 2.57 | |||||||
1 | 2013.0 | 1.0 | 1.0 | 1.0 | 2013.0 | 1.0 | 1.0 | 6.0 | 0.0 | 3048.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 3048.0 | nan | nan | nan | nan | nan | -4.4 | 0.5463 | nan | 1012.0759 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | 9.77 | 4.63 | |||||||
2 | 2013.0 | 1.0 | 1.0 | 2.0 | 2013.0 | 1.0 | 1.0 | 7.0 | 0.0 | 1828.8000000000002 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1828.7999 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.619 | nan | 1012.4145000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 7.72 | 1.54 | |||||||
3 | 2013.0 | 1.0 | 1.0 | 3.0 | 2013.0 | 1.0 | 1.0 | 8.0 | 0.0 | 1463.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1463.04 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 3.09 | |||||||
4 | 2013.0 | 1.0 | 1.0 | 4.0 | 2013.0 | 1.0 | 1.0 | 9.0 | 0.0 | 1402.1000000000001 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1402.0800000000002 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.7531 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | nan | 4.12 | |||||||
5 | 2013.0 | 1.0 | 1.0 | 5.0 | 2013.0 | 1.0 | 1.0 | 10.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 3.09 | |||||||
6 | 2013.0 | 1.0 | 1.0 | 6.0 | 2013.0 | 1.0 | 1.0 | 11.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.5934 | nan | 1012.0759 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 9.26 | 3.09 | |||||||
7 | 2013.0 | 1.0 | 1.0 | 7.0 | 2013.0 | 1.0 | 1.0 | 12.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.5934 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | 9.26 | 4.63 | |||||||
8 | 2013.0 | 1.0 | 1.0 | 8.0 | 2013.0 | 1.0 | 1.0 | 13.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6425000000000001 | nan | 1012.4145000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | nan | 3.09 | |||||||
9 | 2013.0 | 1.0 | 1.0 | 9.0 | 2013.0 | 1.0 | 1.0 | 14.0 | 0.0 | 1524.0 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 1524.0 | 3657.5999 | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 9.26 | 3.09 |
# Lots of columns in there! Let's plan on converting to time-since-epoch to do
# a 'join' with the bike data, plus gather weather info that might affect
# cyclists - rain, snow, temperature. Alas, we have to drop the snow column
# since it's all NAs. Also add in dew point and humidity just in case. Slice
# out just the columns of interest and drop the rest.
wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]
wthr2.set_name(wthr2.names.index("Precipitation One Hour (mm)"), "Rain (mm)")
wthr2.set_name(wthr2.names.index("Weather Code 1/ Description"), "WC1")
wthr2.describe()
# Much better!
Rows:17520 Cols:9
Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | |
---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | real | real | real | real | enum |
mins | 2013.0 | 1.0 | 1.0 | 0.0 | -26.700000000000003 | 0.12510000000000002 | 0.0 | -15.600000000000001 | |
mean | 2013.5 | 6.5260273972602745 | 15.72054794520548 | 11.500000000000004 | 4.313046467655992 | 0.5967363891594567 | 1.3799301075268817 | 12.578909070073914 | |
maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 24.400000000000002 | 1.0 | 26.924 | 36.1 | |
sigma | 0.500014270017262 | 3.447949723847773 | 8.796498048523272 | 6.922384111875021 | 10.973128209713666 | 0.18579201186573496 | 2.5621512917896463 | 10.039673953091574 | |
zeros | 0 | 0 | 0 | 730 | 268 | 0 | 501 | 269 | |
missing | 0 | 0 | 0 | 0 | 67 | 67 | 15660 | 67 | 14980 |
0 | 2013.0 | 1.0 | 1.0 | 0.0 | -5.0 | 0.5447000000000001 | nan | 3.3000000000000003 | |
1 | 2013.0 | 1.0 | 1.0 | 1.0 | -4.4 | 0.5463 | nan | 3.9000000000000004 | |
2 | 2013.0 | 1.0 | 1.0 | 2.0 | -3.3000000000000003 | 0.619 | nan | 3.3000000000000003 | |
3 | 2013.0 | 1.0 | 1.0 | 3.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
4 | 2013.0 | 1.0 | 1.0 | 4.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
5 | 2013.0 | 1.0 | 1.0 | 5.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
6 | 2013.0 | 1.0 | 1.0 | 6.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | |
7 | 2013.0 | 1.0 | 1.0 | 7.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | |
8 | 2013.0 | 1.0 | 1.0 | 8.0 | -2.8000000000000003 | 0.6425000000000001 | nan | 3.3000000000000003 | |
9 | 2013.0 | 1.0 | 1.0 | 9.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 |
# Filter down to the weather at noon
wthr3 = wthr2[wthr2["Hour Local"] == 12]
# Let's now get days since the epoch: convert year/month/day into epoch time
# (msec), then into epoch days. mktime needs zero-based months and days, but
# we have 1-based ones.
wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"] - 1,
                                    day=wthr3["Day Local"] - 1, hour=wthr3["Hour Local"])
msecsPerDay = 1000 * 60 * 60 * 24  # milliseconds per day again
wthr3["Days"] = (wthr3["msec"] / msecsPerDay).floor()
wthr3.describe()
# msec looks sane (values around 1.3e12 are the right magnitude for msec since
# 1970), and the epoch-day numbers line up with those from the CitiBike
# dataset.
Rows:730 Cols:11
Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | msec | Days | |
---|---|---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | real | real | real | real | enum | int | int |
mins | 2013.0 | 1.0 | 1.0 | 12.0 | -26.700000000000003 | 0.1723 | 0.0 | -13.9 | 1357070400000.0 | 15706.0 | |
mean | 2013.5 | 6.526027397260274 | 15.72054794520548 | 12.0 | 4.230123796423659 | 0.539728198074278 | 1.5312571428571429 | 14.068775790921595 | 1388560852602.7397 | 16070.5 | |
maxs | 2014.0 | 12.0 | 31.0 | 12.0 | 23.3 | 1.0 | 12.446 | 34.4 | 1420056000000.0 | 16435.0 | |
sigma | 0.5003428180039172 | 3.450215293068149 | 8.802278027009615 | 0.0 | 11.106296472475226 | 0.17994502792324327 | 2.3606424861490587 | 10.398985514891212 | 18219740080.410755 | 210.87713642466474 | |
zeros | 0 | 0 | 0 | 0 | 14 | 0 | 15 | 7 | 0 | 0 | |
missing | 0 | 0 | 0 | 0 | 3 | 3 | 660 | 3 | 620 | 0 | 0 |
0 | 2013.0 | 1.0 | 1.0 | 12.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | 1357070400000.0 | 15706.0 | |
1 | 2013.0 | 1.0 | 2.0 | 12.0 | -11.700000000000001 | 0.4806 | nan | -2.2 | 1357156800000.0 | 15707.0 | |
2 | 2013.0 | 1.0 | 3.0 | 12.0 | -10.600000000000001 | 0.5248 | nan | -2.2 | 1357243200000.0 | 15708.0 | |
3 | 2013.0 | 1.0 | 4.0 | 12.0 | -7.2 | 0.49760000000000004 | nan | 2.2 | 1357329600000.0 | 15709.0 | |
4 | 2013.0 | 1.0 | 5.0 | 12.0 | -7.2 | 0.42600000000000005 | nan | 4.4 | 1357416000000.0 | 15710.0 | |
5 | 2013.0 | 1.0 | 6.0 | 12.0 | -1.7000000000000002 | 0.6451 | nan | 4.4 | haze | 1357502400000.0 | 15711.0 |
6 | 2013.0 | 1.0 | 7.0 | 12.0 | -6.1000000000000005 | 0.41190000000000004 | nan | 6.1000000000000005 | 1357588800000.0 | 15712.0 | |
7 | 2013.0 | 1.0 | 8.0 | 12.0 | -1.7000000000000002 | 0.5314 | nan | 7.2 | 1357675200000.0 | 15713.0 | |
8 | 2013.0 | 1.0 | 9.0 | 12.0 | 0.6000000000000001 | 0.56 | nan | 8.9 | haze | 1357761600000.0 | 15714.0 |
9 | 2013.0 | 1.0 | 10.0 | 12.0 | -6.1000000000000005 | 0.3952 | nan | 6.7 | 1357848000000.0 | 15715.0 |
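# Quick check (an aside, not part of the original demo): the first msec value
# floors to epoch day 15706, i.e. 2013-01-01, matching the Days column.
print(1357070400000 // (1000 * 60 * 60 * 24))  # -> 15706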
# Let's drop the extra time columns to make an easy-to-handle dataset.
wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
# Also, most rain numbers are missing - let's assume those are zero-rain days
rain = wthr4["Rain (mm)"]
rain[ rain.isna() ] = 0
wthr4["Rain (mm)"] = rain
# ----------
# 6 - Join the weather data-per-day to the bike-starts-per-day
print("Merge Daily Weather with Bikes-Per-Day")
bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)
bpd_with_weather.describe()
bpd_with_weather.show()
Merge Daily Weather with Bikes-Per-Day
Rows:10131 Cols:10
Days | start station name | bikes | Month | DayOfWeek | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | |
---|---|---|---|---|---|---|---|---|---|---|
type | int | enum | int | enum | enum | real | real | real | real | enum |
mins | 15979.0 | 1.0 | -2.2 | 0.34850000000000003 | 0.0 | 9.4 | ||||
mean | 15993.953311617806 | 102.42937518507551 | 7.60732405488106 | 0.5564958839206396 | 0.008198400947586611 | 16.937094067712962 | ||||
maxs | 16009.0 | 603.0 | 19.400000000000002 | 0.8718 | 0.254 | 26.1 | ||||
sigma | 8.950698111468864 | 74.05933443246006 | 6.516386487040385 | 0.14811201086649933 | 0.04489297266255909 | 4.362687300129602 | ||||
zeros | 0 | 0 | 0 | 0 | 9804 | 0 | ||||
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8816 |
0 | 15979.0 | 1 Ave & E 15 St | 173.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
1 | 15979.0 | 1 Ave & E 18 St | 118.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
2 | 15979.0 | 1 Ave & E 30 St | 152.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
3 | 15979.0 | 10 Ave & W 28 St | 115.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
4 | 15979.0 | 11 Ave & W 27 St | 210.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
5 | 15979.0 | 11 Ave & W 41 St | 106.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
6 | 15979.0 | 12 Ave & W 40 St | 144.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
7 | 15979.0 | 2 Ave & E 31 St | 206.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
8 | 15979.0 | 2 Ave & E 58 St | 105.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
9 | 15979.0 | 3 Ave & Schermerhorn St | 15.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 |
Days | start station name | bikes | Month | DayOfWeek | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 |
---|---|---|---|---|---|---|---|---|---|
15979 | 1 Ave & E 15 St | 173 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 1 Ave & E 18 St | 118 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 1 Ave & E 30 St | 152 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 10 Ave & W 28 St | 115 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 11 Ave & W 27 St | 210 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 11 Ave & W 41 St | 106 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 12 Ave & W 40 St | 144 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 2 Ave & E 31 St | 206 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 2 Ave & E 58 St | 105 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
15979 | 3 Ave & Schermerhorn St | 15 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 |
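# Aside (a sketch, not in the original demo): merge() joins on the shared
# column names, here just "Days"; the key could be made explicit with
# by_x/by_y:
# bpd_with_weather = bpd.merge(wthr4, all_x=True, all_y=False, by_x=["Days"], by_y=["Days"])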
# ----------
# 7 - Test/train split again, model build again, this time with weather
split_fit_predict(bpd_with_weather)
Training data has 10 columns and 6066 rows, test has 3044 rows, holdout has 1021
gbm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%
Model | mse TRAIN | mse TEST | mse HOLDOUT | Model Training Time (s) |
---|---|---|---|---|
GBM | 0.2159977 | 393.0248269 | 404.2520310 | 8.679 |
DRF | 804.2152039 | 1703.1540562 | 1782.0854925 | 6.573 |
GLM | 620.8814844 | 735.9622856 | 789.7891737 | 0.241 |
DL | 213.8582644 | 454.7871732 | 476.5995571 | 7.518 |
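# Aside (a sketch, not in the original demo): to produce actual forecasts for
# load balancing, score station/day rows with the trained model, e.g.:
# preds = gbm0.predict(bpd_with_weather)
# preds.head()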