import h2o
import time
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
# Explore a typical Data Science workflow with H2O and Python
#
# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles
# across the CitiBike network of stations, by predicting the number of bike
# trips taken from the station every day. Use 10 million rows of historical
# data, and eventually add weather data.
# Connect to a cluster (attaches to a local H2O at the default port,
# starting one if none is running).
h2o.init()
Connecting to H2O server at http://localhost:54321... successful!
/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:90: DeprecationWarning: DisplayFormatter._ipython_display_formatter_default is deprecated: use @default decorator instead. def _ipython_display_formatter_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:96: DeprecationWarning: DisplayFormatter._formatters_default is deprecated: use @default decorator instead. def _formatters_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:675: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead. def _deferred_printers_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:667: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead. def _singleton_printers_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:670: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead. def _type_printers_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:667: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead. def _singleton_printers_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:670: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead. def _type_printers_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:675: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead. def _deferred_printers_default(self):
H2O cluster uptime: | 2 mins 28 secs |
H2O cluster version: | 3.9.1.99999 |
H2O cluster name: | H2O_from_python_navdeepgill_t343ab |
H2O cluster total nodes: | 1 |
H2O cluster free memory: | 3.244 Gb |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster is healthy: | True |
H2O cluster is locked: | True |
H2O connection url: | http://localhost:54321 |
H2O connection proxy: | None |
Python version: | 2.7.11 final |
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = False
def mylocate(s):
    """Resolve dataset path *s*: an s3n:// URL when data_source_is_s3 is set,
    otherwise a local path found via the h2o git project directory."""
    if data_source_is_s3:
        return "s3n://h2o-public-test-data/" + s
    return _locate(s)
# Pick either the big or the small demo.
# Big data is 10M rows
small_test = [mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")]
# Fourteen consecutive months of trip files, 2013-07 through 2014-08.
_big_test_months = ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11",
                    "2013-12", "2014-01", "2014-02", "2014-03", "2014-04",
                    "2014-05", "2014-06", "2014-07", "2014-08"]
big_test = [mylocate("bigdata/laptop/citibike-nyc/" + m + ".csv")
            for m in _big_test_months]
# ----------
# 1- Load data - 1 row per bicycle trip. Has columns showing the start and end
# station, trip duration and trip start time and day. The larger dataset
# totals about 10 million rows
print("Import and Parse bike data")
# NOTE(review): with data_source_is_s3 = True this resolves to an s3n:// path
# and requires AWS credentials on the cluster (see the failure trace below).
data = h2o.import_file(path=small_test)
Import and Parse bike data Warning: Method get_json in class H2OConnection is deprecated.
--------------------------------------------------------------------------- H2OResponseError Traceback (most recent call last) <ipython-input-6-4d7c875f5051> in <module>() 23 # totals about 10 million rows 24 print("Import and Parse bike data") ---> 25 data = h2o.import_file(path=small_test) /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/h2o.pyc in import_file(path, destination_frame, parse, header, sep, col_names, col_types, na_strings) 336 337 return H2OFrame()._import_parse(path, destination_frame, header, sep, col_names, --> 338 col_types, na_strings) 339 340 /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/frame.pyc in _import_parse(self, path, destination_frame, header, separator, column_names, column_types, na_strings) 196 197 def _import_parse(self, path, destination_frame, header, separator, column_names, column_types, na_strings): --> 198 rawkey = h2o.lazy_import(path) 199 self._parse(rawkey,destination_frame, header, separator, column_names, column_types, na_strings) 200 return self /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/h2o.pyc in lazy_import(path) 213 A path to a data file (remote or local). 214 """ --> 215 return [_import(p)[0] for p in path] if isinstance(path, (list, tuple)) else _import(path) 216 217 /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/h2o.pyc in _import(path) 217 218 def _import(path): --> 219 j = h2oconn.get_json(url_suffix="ImportFiles", path=path) 220 if j['fails']: raise ValueError("ImportFiles of " + path + " failed on " + str(j['fails'])) 221 return j['destination_frames'] /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/utils/backward_compatibility.pyc in <lambda>(*args, **kwargs) 64 self._bcin = { 65 # Creating lambdas in a loop, need to make sure that `fun` is bound to each lambda separately. 
---> 66 name: (lambda fun: lambda *args, **kwargs: fun(self, *args, **kwargs))(fun) 67 for name, fun in viewitems(self._bc["im"]) 68 } /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc in <lambda>(*args, **kwargs) 665 "post": lambda *args, **kwargs: _deprecated_post(*args, **kwargs), 666 "delete": lambda *args, **kwargs: _deprecated_delete(*args, **kwargs), --> 667 "get_json": lambda *args, **kwargs: _deprecated_get(*args, **kwargs), 668 "post_json": lambda *args, **kwargs: _deprecated_post(*args, **kwargs), 669 } /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc in _deprecated_get(self, url_suffix, **kwargs) 1155 restver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else 3 1156 endpoint = "GET /%d/%s" % (restver, url_suffix) -> 1157 return self.request(endpoint, data=kwargs) 1158 1159 def _deprecated_post(self, url_suffix, **kwargs): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc in request(self, endpoint, data, json, filename) 232 auth=self._auth, verify=self._verify_ssl_cert, proxies=self._proxies) 233 self._log_end_transaction(start_time, resp) --> 234 return self._process_response(resp) 235 236 except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e: /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc in _process_response(response) 585 # Client errors (400 = "Bad Request", 404 = "Not Found", 412 = "Precondition Failed") 586 if status_code in {400, 404, 412} and isinstance(data, (H2OErrorV3, H2OModelBuilderErrorV3)): --> 587 raise H2OResponseError(data) 588 589 # Server errors (notably 500 = "Server Error") H2OResponseError: Server error java.lang.IllegalArgumentException: Error: AWS Access Key ID and Secret Access Key must be specified as the username or password (respectively) of a s3n URL, or by setting the fs.s3n.awsAccessKeyId or fs.s3n.awsSecretAccessKey properties (respectively). 
Request: GET /3/ImportFiles params: {'path': 's3n://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-10.csv'}
# ----------
# 2- light data munging: group the bike starts per-day, converting the 10M rows
# of trips to about 140,000 station&day combos - predicting the number of trip
# starts per-station-per-day.
# Convert start time to: Day since the Epoch
start_time = data["starttime"]
# Despite the name, this is MILLIseconds per day: h2o time columns are in ms.
secsPerDay = 1000 * 60 * 60 * 24
data["Days"] = (start_time / secsPerDay).floor()
data.describe()
Rows:1,037,712 Cols:16 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 17 | 2.2135415 | 1.3 KB | 0.0022872 |
C1 | 1-Byte Integers | 48 | 6.25 | 1016.6 KB | 1.7506603 |
C1N | 1-Byte Integers (w/o NAs) | 48 | 6.25 | 1016.6 KB | 1.7506603 |
C1S | 1-Byte Fractions | 79 | 10.286459 | 1.6 MB | 2.8878725 |
C2 | 2-Byte Integers | 243 | 31.640625 | 10.0 MB | 17.696283 |
C2S | 2-Byte Fractions | 49 | 6.3802085 | 2.0 MB | 3.5701983 |
C4 | 4-Byte Integers | 32 | 4.166667 | 2.6 MB | 4.6726856 |
C4S | 4-Byte Fractions | 39 | 5.078125 | 3.2 MB | 5.6373096 |
C8 | 64-bit Integers | 60 | 7.8125 | 9.9 MB | 17.432673 |
C8D | 64-bit Reals | 153 | 19.921875 | 25.3 MB | 44.59937 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 56.7 MB | 1037712.0 | 48.0 | 768.0 |
mean | 56.7 MB | 1037712.0 | 48.0 | 768.0 |
min | 56.7 MB | 1037712.0 | 48.0 | 768.0 |
max | 56.7 MB | 1037712.0 | 48.0 | 768.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 56.7 MB | 1037712.0 | 48.0 | 768.0 |
tripduration | starttime | stoptime | start station id | start station name | start station latitude | start station longitude | end station id | end station name | end station latitude | end station longitude | bikeid | usertype | birth year | gender | Days | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | time | time | int | enum | real | real | int | enum | real | real | int | enum | int | int | int |
mins | 60.0 | 1.380610868e+12 | 1.380611083e+12 | 72.0 | 0.0 | 40.680342423 | -74.01713445 | 72.0 | 0.0 | 40.680342423 | -74.01713445 | 14529.0 | 0.0 | 1899.0 | 0.0 | 15979.0 |
mean | 825.614754383 | 1.38191371692e+12 | 1.38191454253e+12 | 443.714212614 | NaN | 40.7345188586 | -73.9911328848 | 443.207421712 | NaN | 40.7342847885 | -73.9912702982 | 17644.0716451 | 0.906095332809 | 1975.77839486 | 1.12375591686 | 15993.8523906 |
maxs | 1259480.0 | 1.383289197e+12 | 1.38341851e+12 | 3002.0 | 329.0 | 40.770513 | -73.9500479759 | 3002.0 | 329.0 | 40.770513 | -73.9500479759 | 20757.0 | 1.0 | 1997.0 | 2.0 | 16010.0 |
sigma | 2000.3732323 | 778871729.132 | 778847387.503 | 354.434325075 | NaN | 0.0195734073053 | 0.0123161234106 | 357.398217058 | NaN | 0.0195578458116 | 0.0123855811965 | 1717.68112134 | 0.291696182123 | 11.1314906238 | 0.544380593291 | 9.02215033588 |
zeros | 0 | 0 | 0 | 0 | 5239 | 0 | 0 | 0 | 5449 | 0 | 0 | 0 | 97446 | 0 | 97498 | 0 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 97445 | 0 | 0 |
0 | 326.0 | 1.380610868e+12 | 1.380611194e+12 | 239.0 | Willoughby St & Fleet St | 40.69196566 | -73.9813018 | 366.0 | Clinton Ave & Myrtle Ave | 40.693261 | -73.968896 | 16052.0 | Subscriber | 1982.0 | 1.0 | 15979.0 |
1 | 729.0 | 1.380610881e+12 | 1.38061161e+12 | 322.0 | Clinton St & Tillary St | 40.696192 | -73.991218 | 398.0 | Atlantic Ave & Furman St | 40.69165183 | -73.9999786 | 19412.0 | Customer | nan | 0.0 | 15979.0 |
2 | 520.0 | 1.380610884e+12 | 1.380611404e+12 | 174.0 | E 25 St & 1 Ave | 40.7381765 | -73.97738662 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 19645.0 | Subscriber | 1984.0 | 1.0 | 15979.0 |
3 | 281.0 | 1.380610885e+12 | 1.380611166e+12 | 430.0 | York St & Jay St | 40.7014851 | -73.98656928 | 323.0 | Lawrence St & Willoughby St | 40.69236178 | -73.98631746 | 16992.0 | Subscriber | 1985.0 | 1.0 | 15979.0 |
4 | 196.0 | 1.380610887e+12 | 1.380611083e+12 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 401.0 | Allen St & Rivington St | 40.72019576 | -73.98997825 | 15690.0 | Subscriber | 1986.0 | 1.0 | 15979.0 |
5 | 1948.0 | 1.380610908e+12 | 1.380612856e+12 | 369.0 | Washington Pl & 6 Ave | 40.73224119 | -74.00026394 | 307.0 | Canal St & Rutgers St | 40.71427487 | -73.98990025 | 19846.0 | Subscriber | 1977.0 | 1.0 | 15979.0 |
6 | 1327.0 | 1.380610908e+12 | 1.380612235e+12 | 254.0 | W 11 St & 6 Ave | 40.73532427 | -73.99800419 | 539.0 | Metropolitan Ave & Bedford Ave | 40.71534825 | -73.96024116 | 14563.0 | Subscriber | 1986.0 | 2.0 | 15979.0 |
7 | 1146.0 | 1.380610917e+12 | 1.380612063e+12 | 490.0 | 8 Ave & W 33 St | 40.751551 | -73.993934 | 438.0 | St Marks Pl & 1 Ave | 40.72779126 | -73.98564945 | 16793.0 | Subscriber | 1959.0 | 1.0 | 15979.0 |
8 | 380.0 | 1.380610918e+12 | 1.380611298e+12 | 468.0 | Broadway & W 55 St | 40.7652654 | -73.98192338 | 385.0 | E 55 St & 2 Ave | 40.75797322 | -73.96603308 | 16600.0 | Customer | nan | 0.0 | 15979.0 |
9 | 682.0 | 1.380610925e+12 | 1.380611607e+12 | 300.0 | Shevchenko Pl & E 6 St | 40.728145 | -73.990214 | 519.0 | Pershing Square N | 40.75188406 | -73.97770164 | 15204.0 | Subscriber | 1992.0 | 1.0 | 15979.0 |
# Now do a monster Group-By. Count bike starts per-station per-day. Ends up
# with about 340 stations times 400 days (140,000 rows). This is what we want
# to predict.
# Chain the group-by straight into the counted frame; the intermediate
# GroupBy object is not needed afterwards.
bpd = data.group_by(["Days","start station name"]).count().get_frame()
bpd.set_name(2, "bikes")  # rename the generated count column
bpd.show()
bpd.describe()
bpd.dim
Days | start station name | bikes |
---|---|---|
15979 | 1 Ave & E 15 St | 97 |
15979 | 1 Ave & E 18 St | 75 |
15979 | 1 Ave & E 30 St | 113 |
15979 | 10 Ave & W 28 St | 74 |
15979 | 11 Ave & W 27 St | 139 |
15979 | 11 Ave & W 41 St | 60 |
15979 | 12 Ave & W 40 St | 90 |
15979 | 2 Ave & E 31 St | 88 |
15979 | 2 Ave & E 58 St | 55 |
15979 | 3 Ave & Schermerhorn St | 8 |
Rows:10,450 Cols:3 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 1 | 1.0416667 | 80 B | 0.1364815 |
C1N | 1-Byte Integers (w/o NAs) | 1 | 1.0416667 | 412 B | 0.7028798 |
C1S | 1-Byte Fractions | 31 | 32.291664 | 12.4 KB | 21.714207 |
C2 | 2-Byte Integers | 63 | 65.625 | 44.3 KB | 77.446434 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 57.2 KB | 10450.0 | 32.0 | 96.0 |
mean | 57.2 KB | 10450.0 | 32.0 | 96.0 |
min | 57.2 KB | 10450.0 | 32.0 | 96.0 |
max | 57.2 KB | 10450.0 | 32.0 | 96.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 57.2 KB | 10450.0 | 32.0 | 96.0 |
Days | start station name | bikes | |
---|---|---|---|
type | int | enum | int |
mins | 15979.0 | 0.0 | 1.0 |
mean | 15994.4415311 | NaN | 99.3025837321 |
maxs | 16010.0 | 329.0 | 553.0 |
sigma | 9.23370172444 | NaN | 72.9721964301 |
zeros | 0 | 32 | 0 |
missing | 0 | 0 | 0 |
0 | 15979.0 | 1 Ave & E 15 St | 97.0 |
1 | 15979.0 | 1 Ave & E 18 St | 75.0 |
2 | 15979.0 | 1 Ave & E 30 St | 113.0 |
3 | 15979.0 | 10 Ave & W 28 St | 74.0 |
4 | 15979.0 | 11 Ave & W 27 St | 139.0 |
5 | 15979.0 | 11 Ave & W 41 St | 60.0 |
6 | 15979.0 | 12 Ave & W 40 St | 90.0 |
7 | 15979.0 | 2 Ave & E 31 St | 88.0 |
8 | 15979.0 | 2 Ave & E 58 St | 55.0 |
9 | 15979.0 | 3 Ave & Schermerhorn St | 8.0 |
[10450, 3]
# Quantiles: the data is fairly unbalanced; some station/day combos are wildly
# more popular than others.
print("Quantiles of bikes-per-day")
# Default probes (1%,10%,25%,33%,50%,67%,75%,90%,99%) — see table below.
bpd["bikes"].quantile().show()
Quantiles of bikes-per-day
Probs | bikesQuantiles |
---|---|
0.01 | 4.49 |
0.1 | 19 |
0.25 | 43 |
0.333 | 57 |
0.5 | 87 |
0.667 | 118 |
0.75 | 137 |
0.9 | 192 |
0.99 | 334.51 |
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
# Rebuild a millisecond timestamp from the day number so h2o's time
# accessors (month / dayOfWeek) can be applied.
day_msec = bpd["Days"] * secsPerDay
bpd["Month"] = day_msec.month().asfactor()
# Add in day-of-week (work-week; more bike rides on Sunday than Monday)
bpd["DayOfWeek"] = day_msec.dayOfWeek()
print("Bikes-Per-Day")
bpd.describe()
Bikes-Per-Day Rows:10,450 Cols:5 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 33 | 20.625 | 2.6 KB | 3.6613781 |
CBS | Bits | 6 | 3.7500002 | 666 B | 0.9236658 |
C1N | 1-Byte Integers (w/o NAs) | 27 | 16.875 | 10.4 KB | 14.803617 |
C1S | 1-Byte Fractions | 31 | 19.375 | 12.4 KB | 17.65228 |
C2 | 2-Byte Integers | 63 | 39.375 | 44.3 KB | 62.959057 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 70.4 KB | 10450.0 | 32.0 | 160.0 |
mean | 70.4 KB | 10450.0 | 32.0 | 160.0 |
min | 70.4 KB | 10450.0 | 32.0 | 160.0 |
max | 70.4 KB | 10450.0 | 32.0 | 160.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 70.4 KB | 10450.0 | 32.0 | 160.0 |
Days | start station name | bikes | Month | DayOfWeek | |
---|---|---|---|---|---|
type | int | enum | int | enum | enum |
mins | 15979.0 | 0.0 | 1.0 | 0.0 | 0.0 |
mean | 15994.4415311 | NaN | 99.3025837321 | 0.968612440191 | NaN |
maxs | 16010.0 | 329.0 | 553.0 | 1.0 | 6.0 |
sigma | 9.23370172444 | NaN | 72.9721964301 | 0.174371128617 | NaN |
zeros | 0 | 32 | 0 | 328 | 1635 |
missing | 0 | 0 | 0 | 0 | 0 |
0 | 15979.0 | 1 Ave & E 15 St | 97.0 | 9 | Mon |
1 | 15979.0 | 1 Ave & E 18 St | 75.0 | 9 | Mon |
2 | 15979.0 | 1 Ave & E 30 St | 113.0 | 9 | Mon |
3 | 15979.0 | 10 Ave & W 28 St | 74.0 | 9 | Mon |
4 | 15979.0 | 11 Ave & W 27 St | 139.0 | 9 | Mon |
5 | 15979.0 | 11 Ave & W 41 St | 60.0 | 9 | Mon |
6 | 15979.0 | 12 Ave & W 40 St | 90.0 | 9 | Mon |
7 | 15979.0 | 2 Ave & E 31 St | 88.0 | 9 | Mon |
8 | 15979.0 | 2 Ave & E 58 St | 55.0 | 9 | Mon |
9 | 15979.0 | 3 Ave & Schermerhorn St | 8.0 | 9 | Mon |
# ----------
# 3- Fit a model on train; using test as validation
# Function for doing a classic test/train/holdout split
def split_fit_predict(data):
    """Split *data* into train/test/holdout, fit four models, report MSEs.

    Fits GBM, DRF, GLM (poisson) and DeepLearning models predicting the
    "bikes" column from every other column, then prints an HTML table of
    per-split MSE and training time.  The fitted models are also published
    as module globals gbm0/drf0/glm0/dl0 for interactive inspection.

    :param data: H2OFrame with a "bikes" response column and a "Days" column.
    """
    global gbm0, drf0, glm0, dl0
    # Classic Test/Train split
    r = data['Days'].runif()  # Random UNIForm numbers, one per row
    train = data[r < 0.6]
    test = data[(0.6 <= r) & (r < 0.9)]
    hold = data[0.9 <= r]
    print("Training data has", train.ncol, "columns and", train.nrow,
          "rows, test has", test.nrow, "rows, holdout has", hold.nrow)
    bike_names_x = data.names
    bike_names_x.remove("bikes")

    def _timed_train(model):
        # Train `model` on the shared splits; return elapsed wall-clock secs.
        # Reads bike_names_x at call time, so the WC1 removal below is seen
        # by the models trained after it (GLM, DL) but not before (GBM, DRF).
        s = time.time()
        model.train(x=bike_names_x, y="bikes",
                    training_frame=train, validation_frame=test)
        return time.time() - s

    # Run GBM
    gbm0 = H2OGradientBoostingEstimator(ntrees=500,  # 500 works well
                                        max_depth=6,
                                        learn_rate=0.1)
    gbm_elapsed = _timed_train(gbm0)
    # Run DRF
    drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)
    drf_elapsed = _timed_train(drf0)
    # Run GLM: drop the (mostly-missing) weather description column first.
    if "WC1" in bike_names_x:
        bike_names_x.remove("WC1")
    glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family="poisson")
    glm_elapsed = _timed_train(glm0)
    # Run DL
    dl0 = H2ODeepLearningEstimator(hidden=[50, 50, 50, 50], epochs=50)
    dl_elapsed = _timed_train(dl0)

    # ----------
    # 4- Score on holdout set & report
    def _mses(model):
        # MSE of `model` on each of the three splits, in table-column order.
        return [model.model_performance(f).mse() for f in (train, test, hold)]

    # make a pretty HTML table printout of the results
    header = ["Model", "mse TRAIN", "mse TEST", "mse HOLDOUT",
              "Model Training Time (s)"]
    table = [
        ["GBM"] + _mses(gbm0) + [round(gbm_elapsed, 3)],
        ["DRF"] + _mses(drf0) + [round(drf_elapsed, 3)],
        ["GLM"] + _mses(glm0) + [round(glm_elapsed, 3)],
        ["DL "] + _mses(dl0) + [round(dl_elapsed, 3)],
    ]
    h2o.display.H2ODisplay(table, header)
# --------------
# Split the data (into test & train), fit some models and predict on the holdout data
# (the fitted models are left in the globals gbm0/drf0/glm0/dl0 for inspection)
split_fit_predict(bpd)
# Here we see (per the table above) a holdout R^2 of roughly 0.92 for GBM and
# 0.84 for GLM — exact numbers vary by run. This means that given just the
# station, the month, and the day-of-week we can explain about 90% of the
# variance of the bike-trip-starts.
Training data has 5 columns and 6172 rows, test has 3238 rows, holdout has 1040 gbm Model Build Progress: [##################################################] 100% drf Model Build Progress: [##################################################] 100% glm Model Build Progress: [##################################################] 100% deeplearning Model Build Progress: [##################################################] 100%
Model | R2 TRAIN | R2 TEST | R2 HOLDOUT | Model Training Time (s) |
GBM | 0.9976981 | 0.9274821 | 0.9183267 | 5.612 |
DRF | 0.8294274 | 0.7694496 | 0.7611063 | 5.607 |
GLM | 0.8597208 | 0.8465429 | 0.8447966 | 0.14 |
DL | 0.9546943 | 0.9115880 | 0.8978001 | 6.845 |
# ----------
# 5- Now lets add some weather
# Load weather data: hourly NYC observations for 2013 and 2014,
# parsed from two files into a single frame.
wthr1 = h2o.import_file(path=[mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"),
                              mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
# Peek at the data
wthr1.describe()
Parse Progress: [##################################################] 100% Rows:17,520 Cols:50 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 107 | 6.294118 | 8.4 KB | 0.7889721 |
C0D | Constant Reals | 436 | 25.647058 | 34.1 KB | 3.2148771 |
CXI | Sparse Integers | 17 | 1.0 | 1.5 KB | 0.1399135 |
C1 | 1-Byte Integers | 346 | 20.352942 | 197.4 KB | 18.634672 |
C1N | 1-Byte Integers (w/o NAs) | 214 | 12.588236 | 122.3 KB | 11.544063 |
C1S | 1-Byte Fractions | 214 | 12.588236 | 125.3 KB | 11.822968 |
C2S | 2-Byte Fractions | 196 | 11.529412 | 214.5 KB | 20.242111 |
C4S | 4-Byte Fractions | 170 | 10.0 | 356.1 KB | 33.612423 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
mean | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
min | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
max | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
Year Local | Month Local | Day Local | Hour Local | Year UTC | Month UTC | Day UTC | Hour UTC | Cavok Reported | Cloud Ceiling (m) | Cloud Cover Fraction | Cloud Cover Fraction 1 | Cloud Cover Fraction 2 | Cloud Cover Fraction 3 | Cloud Cover Fraction 4 | Cloud Cover Fraction 5 | Cloud Cover Fraction 6 | Cloud Height (m) 1 | Cloud Height (m) 2 | Cloud Height (m) 3 | Cloud Height (m) 4 | Cloud Height (m) 5 | Cloud Height (m) 6 | Dew Point (C) | Humidity Fraction | Precipitation One Hour (mm) | Pressure Altimeter (mbar) | Pressure Sea Level (mbar) | Pressure Station (mbar) | Snow Depth (cm) | Temperature (C) | Visibility (km) | Weather Code 1 | Weather Code 1/ Description | Weather Code 2 | Weather Code 2/ Description | Weather Code 3 | Weather Code 3/ Description | Weather Code 4 | Weather Code 4/ Description | Weather Code 5 | Weather Code 5/ Description | Weather Code 6 | Weather Code 6/ Description | Weather Code Most Severe / Icon Code | Weather Code Most Severe | Weather Code Most Severe / Description | Wind Direction (degrees) | Wind Gust (m/s) | Wind Speed (m/s) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | int | int | int | int | int | real | real | real | real | real | int | int | int | real | real | real | int | int | int | real | real | real | real | int | int | int | real | real | int | enum | int | enum | int | enum | int | enum | int | enum | int | enum | int | int | enum | int | real | real |
mins | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 0.0 | 0.0 | 61.0 | 0.0 | 0.0 | 0.25 | 0.5 | NaN | NaN | NaN | 60.96 | 213.36 | 365.76 | NaN | NaN | NaN | -26.7 | 0.1251 | 0.0 | 983.2949 | NaN | NaN | NaN | -15.6 | 0.001 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 3.0 | 0.0 | 0.0 | 1.0 | 0.0 | 10.0 | 7.2 | 0.0 |
mean | 2013.5 | 6.52602739726 | 15.7205479452 | 11.5 | 2013.50057078 | 6.52511415525 | 15.721347032 | 11.5001141553 | 0.0 | 1306.31195846 | 0.416742490522 | 0.361207349081 | 0.872445384073 | 0.963045685279 | 0.0 | 0.0 | 0.0 | 1293.9822682 | 1643.73900166 | 2084.89386376 | 0.0 | 0.0 | 0.0 | 4.31304646766 | 0.596736389159 | 1.37993010753 | 1017.82581441 | 0.0 | 0.0 | 0.0 | 12.5789090701 | 14.3914429682 | 4.84251968504 | NaN | 3.65867689358 | NaN | 2.84660766962 | NaN | 2.01149425287 | NaN | 4.125 | NaN | 3.0 | 0.0 | 1.37848173516 | 4.84251968504 | NaN | 194.69525682 | 9.42216948073 | 2.41032887849 |
maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 2015.0 | 12.0 | 31.0 | 23.0 | 0.0 | 3657.6 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | 3657.5999 | 3657.5999 | 3657.5999 | NaN | NaN | NaN | 24.4 | 1.0 | 26.924 | 1042.2113 | NaN | NaN | NaN | 36.1 | 16.0934 | 60.0 | 11.0 | 60.0 | 10.0 | 36.0 | 7.0 | 27.0 | 4.0 | 27.0 | 2.0 | 3.0 | 0.0 | 16.0 | 60.0 | 11.0 | 360.0 | 20.58 | 10.8 |
sigma | 0.500014270017 | 3.44794972385 | 8.79649804852 | 6.92238411188 | 0.500584411716 | 3.44782405458 | 8.79561488868 | 6.92230165203 | 0.0 | 995.339856966 | 0.462720830993 | 0.42770569708 | 0.197155690367 | 0.0861015598104 | -0.0 | -0.0 | -0.0 | 962.743095854 | 916.73861349 | 887.215847511 | -0.0 | -0.0 | -0.0 | 10.9731282097 | 0.185792011866 | 2.56215129179 | 7.46451697179 | -0.0 | -0.0 | -0.0 | 10.0396739531 | 3.69893623033 | 5.70486576983 | NaN | 6.13386253912 | NaN | 5.80553286364 | NaN | 3.12340844261 | NaN | 6.15223536611 | NaN | 0.0 | 0.0 | 4.07386062702 | 5.70486576983 | NaN | 106.350000031 | 1.81511871115 | 1.61469790524 |
zeros | 0 | 0 | 0 | 730 | 0 | 0 | 0 | 730 | 17455 | 0 | 8758 | 8758 | 0 | 0 | -17520 | -17520 | -17520 | 0 | 0 | 0 | -17520 | -17520 | -17520 | 268 | 0 | 501 | 0 | -17520 | -17520 | -17520 | 269 | 0 | 0 | 17 | 0 | 30 | 0 | 13 | -5044 | -5024 | -11241 | -11229 | -17030 | -17028 | 14980 | 0 | 17 | 0 | 0 | 2768 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 65 | 10780 | 375 | 375 | 14682 | 16535 | 17520 | 17520 | 17520 | 9103 | 14683 | 16535 | 17520 | 17520 | 17520 | 67 | 67 | 15660 | 360 | 17520 | 17520 | 17520 | 67 | 412 | 14980 | 14980 | 16477 | 16477 | 17181 | 17181 | 17433 | 17433 | 17504 | 17504 | 17518 | 17518 | 0 | 14980 | 14980 | 9382 | 14381 | 1283 |
0 | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 5.0 | 0.0 | 2895.6 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 2895.5999 | 3352.8 | nan | nan | nan | nan | -5.0 | 0.5447 | nan | 1013.0917 | nan | nan | nan | 3.3 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 2.57 | |||||||
1 | 2013.0 | 1.0 | 1.0 | 1.0 | 2013.0 | 1.0 | 1.0 | 6.0 | 0.0 | 3048.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 3048.0 | nan | nan | nan | nan | nan | -4.4 | 0.5463 | nan | 1012.0759 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | 9.77 | 4.63 | |||||||
2 | 2013.0 | 1.0 | 1.0 | 2.0 | 2013.0 | 1.0 | 1.0 | 7.0 | 0.0 | 1828.8 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1828.7999 | nan | nan | nan | nan | nan | -3.3 | 0.619 | nan | 1012.4145 | nan | nan | nan | 3.3 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 7.72 | 1.54 | |||||||
3 | 2013.0 | 1.0 | 1.0 | 3.0 | 2013.0 | 1.0 | 1.0 | 8.0 | 0.0 | 1463.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1463.04 | nan | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 3.09 | |||||||
4 | 2013.0 | 1.0 | 1.0 | 4.0 | 2013.0 | 1.0 | 1.0 | 9.0 | 0.0 | 1402.1 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1402.08 | nan | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.7531 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | nan | 4.12 | |||||||
5 | 2013.0 | 1.0 | 1.0 | 5.0 | 2013.0 | 1.0 | 1.0 | 10.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 3.09 | |||||||
6 | 2013.0 | 1.0 | 1.0 | 6.0 | 2013.0 | 1.0 | 1.0 | 11.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3 | 0.5934 | nan | 1012.0759 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 9.26 | 3.09 | |||||||
7 | 2013.0 | 1.0 | 1.0 | 7.0 | 2013.0 | 1.0 | 1.0 | 12.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3 | 0.5934 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | 9.26 | 4.63 | |||||||
8 | 2013.0 | 1.0 | 1.0 | 8.0 | 2013.0 | 1.0 | 1.0 | 13.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8 | 0.6425 | nan | 1012.4145 | nan | nan | nan | 3.3 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | nan | 3.09 | |||||||
9 | 2013.0 | 1.0 | 1.0 | 9.0 | 2013.0 | 1.0 | 1.0 | 14.0 | 0.0 | 1524.0 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 1524.0 | 3657.5999 | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 9.26 | 3.09 |
# Lots of columns in there! Lets plan on converting to time-since-epoch to do
# a 'join' with the bike data, plus gather weather info that might affect
# cyclists - rain, snow, temperature. Alas, drop the "snow" column since it's
# all NA's. Also add in dew point and humidity just in case. Slice out just
# the columns of interest and drop the rest.
keep_cols = ["Year Local", "Month Local", "Day Local", "Hour Local",
             "Dew Point (C)", "Humidity Fraction",
             "Precipitation One Hour (mm)", "Temperature (C)",
             "Weather Code 1/ Description"]
wthr2 = wthr1[keep_cols]
# Shorten the two unwieldy names.
renames = {"Precipitation One Hour (mm)": "Rain (mm)",
           "Weather Code 1/ Description": "WC1"}
for old_name, new_name in renames.items():
    wthr2.set_name(wthr2.names.index(old_name), new_name)
wthr2.describe()
# Much better!
Rows:17,520 Cols:9 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 46 | 15.0326805 | 3.6 KB | 1.780005 |
C1 | 1-Byte Integers | 34 | 11.111112 | 19.4 KB | 9.592678 |
C1N | 1-Byte Integers (w/o NAs) | 90 | 29.411766 | 51.5 KB | 25.494701 |
C1S | 1-Byte Fractions | 42 | 13.725491 | 24.0 KB | 11.894592 |
C2S | 2-Byte Fractions | 94 | 30.718956 | 103.4 KB | 51.238026 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 201.9 KB | 17520.0 | 34.0 | 306.0 |
mean | 201.9 KB | 17520.0 | 34.0 | 306.0 |
min | 201.9 KB | 17520.0 | 34.0 | 306.0 |
max | 201.9 KB | 17520.0 | 34.0 | 306.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 201.9 KB | 17520.0 | 34.0 | 306.0 |
Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | |
---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | real | real | real | real | enum |
mins | 2013.0 | 1.0 | 1.0 | 0.0 | -26.7 | 0.1251 | 0.0 | -15.6 | 0.0 |
mean | 2013.5 | 6.52602739726 | 15.7205479452 | 11.5 | 4.31304646766 | 0.596736389159 | 1.37993010753 | 12.5789090701 | NaN |
maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 24.4 | 1.0 | 26.924 | 36.1 | 11.0 |
sigma | 0.500014270017 | 3.44794972385 | 8.79649804852 | 6.92238411188 | 10.9731282097 | 0.185792011866 | 2.56215129179 | 10.0396739531 | NaN |
zeros | 0 | 0 | 0 | 730 | 268 | 0 | 501 | 269 | 17 |
missing | 0 | 0 | 0 | 0 | 67 | 67 | 15660 | 67 | 14980 |
0 | 2013.0 | 1.0 | 1.0 | 0.0 | -5.0 | 0.5447 | nan | 3.3 | |
1 | 2013.0 | 1.0 | 1.0 | 1.0 | -4.4 | 0.5463 | nan | 3.9 | |
2 | 2013.0 | 1.0 | 1.0 | 2.0 | -3.3 | 0.619 | nan | 3.3 | |
3 | 2013.0 | 1.0 | 1.0 | 3.0 | -2.8 | 0.6159 | nan | 3.9 | |
4 | 2013.0 | 1.0 | 1.0 | 4.0 | -2.8 | 0.6159 | nan | 3.9 | |
5 | 2013.0 | 1.0 | 1.0 | 5.0 | -2.8 | 0.6159 | nan | 3.9 | |
6 | 2013.0 | 1.0 | 1.0 | 6.0 | -3.3 | 0.5934 | nan | 3.9 | |
7 | 2013.0 | 1.0 | 1.0 | 7.0 | -3.3 | 0.5934 | nan | 3.9 | |
8 | 2013.0 | 1.0 | 1.0 | 8.0 | -2.8 | 0.6425 | nan | 3.3 | |
9 | 2013.0 | 1.0 | 1.0 | 9.0 | -2.8 | 0.6159 | nan | 3.9 |
# Filter down to the weather at Noon: keep only rows where the local hour is 12.
is_noon = wthr2["Hour Local"] == 12
wthr3 = wthr2[is_noon]
# Let's now get Days since the epoch... convert year/month/day into Epoch
# time (msec), then back down to Epoch days. mktime wants zero-based month
# and day, but the data is 1-based, hence the -1 adjustments.
wthr3["msec"] = h2o.H2OFrame.mktime(
    year=wthr3["Year Local"],
    month=wthr3["Month Local"] - 1,
    day=wthr3["Day Local"] - 1,
    hour=wthr3["Hour Local"])
secsPerDay = 1000 * 60 * 60 * 24
wthr3["Days"] = (wthr3["msec"] / secsPerDay).floor()
wthr3.describe()
# msec looks sane (numbers like 1.3e12 are in the correct range for msec since
# 1970). Epoch Days matches closely with the epoch day numbers from the
# CitiBike dataset.
Rows:730 Cols:11 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 80 | 21.390373 | 6.3 KB | 12.498779 |
C0D | Constant Reals | 13 | 3.4759357 | 1.0 KB | 2.0310516 |
C1 | 1-Byte Integers | 30 | 8.021391 | 2.6 KB | 5.2455816 |
C1N | 1-Byte Integers (w/o NAs) | 56 | 14.973262 | 4.9 KB | 9.801778 |
C1S | 1-Byte Fractions | 34 | 9.090909 | 3.5 KB | 7.0032225 |
C2S | 2-Byte Fractions | 34 | 9.090909 | 4.2 KB | 8.4288645 |
CUD | Unique Reals | 25 | 6.6844916 | 3.6 KB | 7.2297626 |
C8D | 64-bit Reals | 102 | 27.272728 | 23.9 KB | 47.76096 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 50.0 KB | 730.0 | 34.0 | 374.0 |
mean | 50.0 KB | 730.0 | 34.0 | 374.0 |
min | 50.0 KB | 730.0 | 34.0 | 374.0 |
max | 50.0 KB | 730.0 | 34.0 | 374.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 50.0 KB | 730.0 | 34.0 | 374.0 |
Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | msec | Days | |
---|---|---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | real | real | real | real | enum | int | int |
mins | 2013.0 | 1.0 | 1.0 | 12.0 | -26.7 | 0.1723 | 0.0 | -13.9 | 0.0 | 1.3570704e+12 | 15706.0 |
mean | 2013.5 | 6.52602739726 | 15.7205479452 | 12.0 | 4.23012379642 | 0.539728198074 | 1.53125714286 | 14.0687757909 | NaN | 1.3885608526e+12 | 16070.5 |
maxs | 2014.0 | 12.0 | 31.0 | 12.0 | 23.3 | 1.0 | 12.446 | 34.4 | 10.0 | 1.420056e+12 | 16435.0 |
sigma | 0.500342818004 | 3.45021529307 | 8.80227802701 | 0.0 | 11.1062964725 | 0.179945027923 | 2.36064248615 | 10.3989855149 | NaN | 18219740080.4 | 210.877136425 |
zeros | 0 | 0 | 0 | 0 | 14 | 0 | -174 | 7 | -83 | 0 | 0 |
missing | 0 | 0 | 0 | 0 | 3 | 3 | 660 | 3 | 620 | 0 | 0 |
0 | 2013.0 | 1.0 | 1.0 | 12.0 | -3.3 | 0.5934 | nan | 3.9 | 1.3570704e+12 | 15706.0 | |
1 | 2013.0 | 1.0 | 2.0 | 12.0 | -11.7 | 0.4806 | nan | -2.2 | 1.3571568e+12 | 15707.0 | |
2 | 2013.0 | 1.0 | 3.0 | 12.0 | -10.6 | 0.5248 | nan | -2.2 | 1.3572432e+12 | 15708.0 | |
3 | 2013.0 | 1.0 | 4.0 | 12.0 | -7.2 | 0.4976 | nan | 2.2 | 1.3573296e+12 | 15709.0 | |
4 | 2013.0 | 1.0 | 5.0 | 12.0 | -7.2 | 0.426 | nan | 4.4 | 1.357416e+12 | 15710.0 | |
5 | 2013.0 | 1.0 | 6.0 | 12.0 | -1.7 | 0.6451 | nan | 4.4 | haze | 1.3575024e+12 | 15711.0 |
6 | 2013.0 | 1.0 | 7.0 | 12.0 | -6.1 | 0.4119 | nan | 6.1 | 1.3575888e+12 | 15712.0 | |
7 | 2013.0 | 1.0 | 8.0 | 12.0 | -1.7 | 0.5314 | nan | 7.2 | 1.3576752e+12 | 15713.0 | |
8 | 2013.0 | 1.0 | 9.0 | 12.0 | 0.6 | 0.56 | nan | 8.9 | haze | 1.3577616e+12 | 15714.0 |
9 | 2013.0 | 1.0 | 10.0 | 12.0 | -6.1 | 0.3952 | nan | 6.7 | 1.357848e+12 | 15715.0 |
# Drop off the extra time columns to make an easy-to-handle dataset.
wthr4 = wthr3
for _time_col in ["Year Local", "Month Local", "Day Local", "Hour Local", "msec"]:
    wthr4 = wthr4.drop(_time_col)
# Most rain numbers are missing - assume a missing reading means a zero-rain day.
rain_col = wthr4["Rain (mm)"]
rain_col[rain_col.isna()] = 0
wthr4["Rain (mm)"] = rain_col
# ----------
# 6 - Join the weather data-per-day to the bike-starts-per-day
# (left join on the shared "Days" column; keep every bike row).
print("Merge Daily Weather with Bikes-Per-Day")
bpd_with_weather = bpd.merge(wthr4, all_x=True, all_y=False)
bpd_with_weather.describe()
bpd_with_weather.show()
Merge Daily Weather with Bikes-Per-Day Rows:10,450 Cols:10 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 66 | 20.625 | 5.2 KB | 3.6253278 |
C0D | Constant Reals | 33 | 10.3125 | 2.6 KB | 1.8126639 |
CBS | Bits | 6 | 1.8750001 | 666 B | 0.4572857 |
C1 | 1-Byte Integers | 4 | 1.25 | 1.5 KB | 1.0821055 |
C1N | 1-Byte Integers (w/o NAs) | 28 | 8.75 | 10.8 KB | 7.599456 |
C1S | 1-Byte Fractions | 31 | 9.6875 | 12.4 KB | 8.739238 |
C2 | 2-Byte Integers | 63 | 19.6875 | 44.3 KB | 31.169582 |
CUD | Unique Reals | 89 | 27.812498 | 64.7 KB | 45.514343 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 142.2 KB | 10450.0 | 32.0 | 320.0 |
mean | 142.2 KB | 10450.0 | 32.0 | 320.0 |
min | 142.2 KB | 10450.0 | 32.0 | 320.0 |
max | 142.2 KB | 10450.0 | 32.0 | 320.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 142.2 KB | 10450.0 | 32.0 | 320.0 |
Days | start station name | bikes | Month | DayOfWeek | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | Dew Point (C) | |
---|---|---|---|---|---|---|---|---|---|---|
type | int | enum | int | enum | enum | real | int | real | enum | real |
mins | 15979.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.3485 | 0.0 | 9.4 | 2.0 | -2.2 |
mean | 15994.4415311 | NaN | 99.3025837321 | 0.968612440191 | NaN | 0.562374191388 | 0.0 | 16.9630717703 | NaN | 7.77999043062 |
maxs | 16010.0 | 329.0 | 553.0 | 1.0 | 6.0 | 0.8718 | 0.0 | 26.1 | 8.0 | 19.4 |
sigma | 9.23370172444 | NaN | 72.9721964301 | 0.174371128617 | NaN | 0.149631413472 | 0.0 | 4.29746634617 | NaN | 6.49151146664 |
zeros | 0 | 32 | 0 | 328 | 1635 | 0 | 10450 | 0 | -8494 | 0 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9134 | 0 |
0 | 15979.0 | 1 Ave & E 15 St | 97.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
1 | 15979.0 | 1 Ave & E 18 St | 75.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
2 | 15979.0 | 1 Ave & E 30 St | 113.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
3 | 15979.0 | 10 Ave & W 28 St | 74.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
4 | 15979.0 | 11 Ave & W 27 St | 139.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
5 | 15979.0 | 11 Ave & W 41 St | 60.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
6 | 15979.0 | 12 Ave & W 40 St | 90.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
7 | 15979.0 | 2 Ave & E 31 St | 88.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
8 | 15979.0 | 2 Ave & E 58 St | 55.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
9 | 15979.0 | 3 Ave & Schermerhorn St | 8.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 |
Days | start station name | bikes | Month | DayOfWeek | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | Dew Point (C) |
---|---|---|---|---|---|---|---|---|---|
15979 | 1 Ave & E 15 St | 97 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 1 Ave & E 18 St | 75 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 1 Ave & E 30 St | 113 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 10 Ave & W 28 St | 74 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 11 Ave & W 27 St | 139 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 11 Ave & W 41 St | 60 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 12 Ave & W 40 St | 90 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 2 Ave & E 31 St | 88 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 2 Ave & E 58 St | 55 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 3 Ave & Schermerhorn St | 8 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 |
# 7 - Test/Train split again, model build again, this time with weather
# split_fit_predict is presumably defined earlier in this script (outside this
# view); per the output below it splits the frame into train/test/holdout,
# builds GBM, DRF, GLM and DeepLearning models, and reports R2 and timings.
split_fit_predict(bpd_with_weather)
Training data has 10 columns and 6289 rows, test has 3080 rows, holdout has 1081 gbm Model Build Progress: [##################################################] 100% drf Model Build Progress: [##################################################] 100% glm Model Build Progress: [##################################################] 100% deeplearning Model Build Progress: [##################################################] 100%
Model | R2 TRAIN | R2 TEST | R2 HOLDOUT | Model Training Time (s) |
GBM | 0.9954410 | 0.9255962 | 0.9230051 | 6.706 |
DRF | 0.8491125 | 0.7430226 | 0.7442895 | 6.692 |
GLM | 0.8660565 | 0.8446801 | 0.8673705 | 0.139 |
DL | 0.9617874 | 0.9117793 | 0.9213475 | 7.972 |